oceansweep committed
Commit: b45812e
1 Parent(s): a913a97

Check two folders up instead of one

Files changed (1):
  1. App_Function_Libraries/Utils/Utils.py (+615 -614)
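
The change itself is a one-liner: load_comprehensive_config() locates the project root relative to this module, and since the module lives at App_Function_Libraries/Utils/Utils.py, the directory holding config.txt is two levels above the file's folder, not one. A minimal sketch of the path math (the checkout location below is hypothetical; only the repository layout shown in the file path above is taken from the commit):

    import os

    # Hypothetical checkout path, mirroring the layout <project_root>/App_Function_Libraries/Utils/Utils.py
    utils_py = "/home/user/project/App_Function_Libraries/Utils/Utils.py"

    current_dir = os.path.dirname(utils_py)  # /home/user/project/App_Function_Libraries/Utils
    one_up = os.path.dirname(current_dir)    # /home/user/project/App_Function_Libraries (old lookup dir)
    two_up = os.path.dirname(one_up)         # /home/user/project (new lookup dir)

    print(os.path.join(one_up, 'config.txt'))  # where the old code looked, and failed
    print(os.path.join(two_up, 'config.txt'))  # where config.txt actually sits, at the project root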
App_Function_Libraries/Utils/Utils.py CHANGED
@@ -1,614 +1,615 @@
 # Utils.py
 #########################################
 # General Utilities Library
 # This library is used to hold random utilities used by various other libraries.
 #
 ####
 ####################
 # Function List
 #
 # 1. extract_text_from_segments(segments: List[Dict]) -> str
 # 2. download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5)
 # 3. verify_checksum(file_path, expected_checksum)
 # 4. create_download_directory(title)
 # 5. sanitize_filename(filename)
 # 6. normalize_title(title)
 # 7.
 #
 #
 #
 ####################
 # Import necessary libraries
 import configparser
 import hashlib
 import json
 import logging
 import os
 import re
 import time
 from datetime import timedelta
 from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

 import requests
 import unicodedata
 from tqdm import tqdm

 #######################################################################################################################
 # Function Definitions
 #

 def extract_text_from_segments(segments):
     logging.debug(f"Segments received: {segments}")
     logging.debug(f"Type of segments: {type(segments)}")

     def extract_text_recursive(data):
         if isinstance(data, dict):
             for key, value in data.items():
                 if key == 'Text':
                     return value
                 elif isinstance(value, (dict, list)):
                     result = extract_text_recursive(value)
                     if result:
                         return result
         elif isinstance(data, list):
             return ' '.join(filter(None, [extract_text_recursive(item) for item in data]))
         return None

     text = extract_text_recursive(segments)

     if text:
         return text.strip()
     else:
         logging.error(f"Unable to extract text from segments: {segments}")
         return "Error: Unable to extract transcription"

 def import_data(file):
     # Implement this function to import data from a file
     pass

 #
 #
 #######################
 # Temp file cleanup
 #
 # Global list to keep track of downloaded files
 downloaded_files = []

 def cleanup_downloads():
     """Function to clean up downloaded files when the server exits."""
     for file_path in downloaded_files:
         try:
             if os.path.exists(file_path):
                 os.remove(file_path)
                 print(f"Cleaned up file: {file_path}")
         except Exception as e:
             print(f"Error cleaning up file {file_path}: {e}")

 #
 #
 #######################################################################################################################


 #######################################################################################################################
 # Config loading
 #

+
 def load_comprehensive_config():
     # Get the directory of the current script
     current_dir = os.path.dirname(os.path.abspath(__file__))
-    # Go up one level to the project root directory
-    project_root = os.path.dirname(current_dir)
+    # Go up two levels to the project root directory
+    project_root = os.path.dirname(os.path.dirname(current_dir))
     # Construct the path to the config file in the project root directory
     config_path = os.path.join(project_root, 'config.txt')
     # Create a ConfigParser object
     config = configparser.ConfigParser()
     # Read the configuration file
     files_read = config.read(config_path)
     if not files_read:
         raise FileNotFoundError(f"Config file not found at {config_path}")
     return config


 # FIXME - update to include prompt path in return statement
 def load_and_log_configs():
     try:
         config = load_comprehensive_config()
         if config is None:
             logging.error("Config is None, cannot proceed")
             return None
         # API Keys
         anthropic_api_key = config.get('API', 'anthropic_api_key', fallback=None)
         logging.debug(
             f"Loaded Anthropic API Key: {anthropic_api_key[:5]}...{anthropic_api_key[-5:] if anthropic_api_key else None}")

         cohere_api_key = config.get('API', 'cohere_api_key', fallback=None)
         logging.debug(
             f"Loaded Cohere API Key: {cohere_api_key[:5]}...{cohere_api_key[-5:] if cohere_api_key else None}")

         groq_api_key = config.get('API', 'groq_api_key', fallback=None)
         logging.debug(f"Loaded Groq API Key: {groq_api_key[:5]}...{groq_api_key[-5:] if groq_api_key else None}")

         openai_api_key = config.get('API', 'openai_api_key', fallback=None)
         logging.debug(
             f"Loaded OpenAI API Key: {openai_api_key[:5]}...{openai_api_key[-5:] if openai_api_key else None}")

         huggingface_api_key = config.get('API', 'huggingface_api_key', fallback=None)
         logging.debug(
             f"Loaded HuggingFace API Key: {huggingface_api_key[:5]}...{huggingface_api_key[-5:] if huggingface_api_key else None}")

         openrouter_api_key = config.get('API', 'openrouter_api_key', fallback=None)
         logging.debug(
             f"Loaded OpenRouter API Key: {openrouter_api_key[:5]}...{openrouter_api_key[-5:] if openrouter_api_key else None}")

         deepseek_api_key = config.get('API', 'deepseek_api_key', fallback=None)
         logging.debug(
             f"Loaded DeepSeek API Key: {deepseek_api_key[:5]}...{deepseek_api_key[-5:] if deepseek_api_key else None}")

         mistral_api_key = config.get('API', 'mistral_api_key', fallback=None)
         logging.debug(
             f"Loaded Mistral API Key: {mistral_api_key[:5]}...{mistral_api_key[-5:] if mistral_api_key else None}")

         # Models
         anthropic_model = config.get('API', 'anthropic_model', fallback='claude-3-sonnet-20240229')
         cohere_model = config.get('API', 'cohere_model', fallback='command-r-plus')
         groq_model = config.get('API', 'groq_model', fallback='llama3-70b-8192')
         openai_model = config.get('API', 'openai_model', fallback='gpt-4-turbo')
         huggingface_model = config.get('API', 'huggingface_model', fallback='CohereForAI/c4ai-command-r-plus')
         openrouter_model = config.get('API', 'openrouter_model', fallback='microsoft/wizardlm-2-8x22b')
         deepseek_model = config.get('API', 'deepseek_model', fallback='deepseek-chat')
         mistral_model = config.get('API', 'mistral_model', fallback='mistral-large-latest')

         logging.debug(f"Loaded Anthropic Model: {anthropic_model}")
         logging.debug(f"Loaded Cohere Model: {cohere_model}")
         logging.debug(f"Loaded Groq Model: {groq_model}")
         logging.debug(f"Loaded OpenAI Model: {openai_model}")
         logging.debug(f"Loaded HuggingFace Model: {huggingface_model}")
         logging.debug(f"Loaded OpenRouter Model: {openrouter_model}")
         logging.debug(f"Loaded Deepseek Model: {deepseek_model}")
         logging.debug(f"Loaded Mistral Model: {mistral_model}")

         # Local-Models
         kobold_api_ip = config.get('Local-API', 'kobold_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
         kobold_api_key = config.get('Local-API', 'kobold_api_key', fallback='')

         llama_api_IP = config.get('Local-API', 'llama_api_IP', fallback='http://127.0.0.1:8080/v1/chat/completions')
         llama_api_key = config.get('Local-API', 'llama_api_key', fallback='')

         ooba_api_IP = config.get('Local-API', 'ooba_api_IP', fallback='http://127.0.0.1:5000/v1/chat/completions')
         ooba_api_key = config.get('Local-API', 'ooba_api_key', fallback='')

         tabby_api_IP = config.get('Local-API', 'tabby_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
         tabby_api_key = config.get('Local-API', 'tabby_api_key', fallback=None)
         tabby_model = config.get('models', 'tabby_model', fallback=None)

         vllm_api_url = config.get('Local-API', 'vllm_api_IP', fallback='http://127.0.0.1:500/api/v1/chat/completions')
         vllm_api_key = config.get('Local-API', 'vllm_api_key', fallback=None)
         vllm_model = config.get('Local-API', 'vllm_model', fallback=None)

         ollama_api_url = config.get('Local-API', 'ollama_api_IP', fallback='http://127.0.0.1:11434/api/generate')
         ollama_api_key = config.get('Local-API', 'ollama_api_key', fallback=None)
         ollama_model = config.get('Local-API', 'ollama_model', fallback=None)

         aphrodite_api_url = config.get('Local-API', 'aphrodite_api_IP', fallback='http://127.0.0.1:8080/v1/chat/completions')
         aphrodite_api_key = config.get('Local-API', 'aphrodite_api_key', fallback='')

         logging.debug(f"Loaded Kobold API IP: {kobold_api_ip}")
         logging.debug(f"Loaded Llama API IP: {llama_api_IP}")
         logging.debug(f"Loaded Ooba API IP: {ooba_api_IP}")
         logging.debug(f"Loaded Tabby API IP: {tabby_api_IP}")
         logging.debug(f"Loaded VLLM API URL: {vllm_api_url}")

         # Retrieve output paths from the configuration file
         output_path = config.get('Paths', 'output_path', fallback='results')
         logging.debug(f"Output path set to: {output_path}")

         # Retrieve processing choice from the configuration file
         processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
         logging.debug(f"Processing choice set to: {processing_choice}")

         # Prompts - FIXME
         prompt_path = config.get('Prompts', 'prompt_path', fallback='prompts.db')

         return {
             'api_keys': {
                 'anthropic': anthropic_api_key,
                 'cohere': cohere_api_key,
                 'groq': groq_api_key,
                 'openai': openai_api_key,
                 'huggingface': huggingface_api_key,
                 'openrouter': openrouter_api_key,
                 'deepseek': deepseek_api_key,
                 'mistral': mistral_api_key,
                 'kobold': kobold_api_key,
                 'llama': llama_api_key,
                 'ooba': ooba_api_key,
                 'tabby': tabby_api_key,
                 'vllm': vllm_api_key,
                 'ollama': ollama_api_key
             },
             'models': {
                 'anthropic': anthropic_model,
                 'cohere': cohere_model,
                 'groq': groq_model,
                 'openai': openai_model,
                 'huggingface': huggingface_model,
                 'openrouter': openrouter_model,
                 'deepseek': deepseek_model,
                 'mistral': mistral_model,
                 'vllm': vllm_model,
                 'tabby': tabby_model,
                 'ollama': ollama_model

             },
             'local_api_ip': {
                 'kobold': kobold_api_ip,
                 'llama': llama_api_IP,
                 'ooba': ooba_api_IP,
                 'tabby': tabby_api_IP,
                 'vllm': vllm_api_url,
                 'ollama': ollama_api_url,
                 'aphrodite': aphrodite_api_url
             },
             'output_path': output_path,
             'processing_choice': processing_choice
         }

     except Exception as e:
         logging.error(f"Error loading config: {str(e)}")
         return None

 #
 # End of Config loading
 #######################################################################################################################


 #######################################################################################################################
 #
 # Prompt Handling Functions



 #
 # End of Prompt Handling Functions
 ### #############################################################################################################

 #######################################################################################################################
 #
 # Misc-Functions

 # Log file
 # logging.basicConfig(filename='debug-runtime.log', encoding='utf-8', level=logging.DEBUG)

 def format_metadata_as_text(metadata):
     if not metadata:
         return "No metadata available"

     formatted_text = "Video Metadata:\n"
     for key, value in metadata.items():
         if value is not None:
             if isinstance(value, list):
                 # Join list items with commas
                 formatted_value = ", ".join(str(item) for item in value)
             elif key == 'upload_date' and len(str(value)) == 8:
                 # Format date as YYYY-MM-DD
                 formatted_value = f"{value[:4]}-{value[4:6]}-{value[6:]}"
             elif key in ['view_count', 'like_count']:
                 # Format large numbers with commas
                 formatted_value = f"{value:,}"
             elif key == 'duration':
                 # Convert seconds to HH:MM:SS format
                 hours, remainder = divmod(value, 3600)
                 minutes, seconds = divmod(remainder, 60)
                 formatted_value = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
             else:
                 formatted_value = str(value)

             formatted_text += f"{key.capitalize()}: {formatted_value}\n"
     return formatted_text.strip()

 # # Example usage:
 # example_metadata = {
 #     'title': 'Sample Video Title',
 #     'uploader': 'Channel Name',
 #     'upload_date': '20230615',
 #     'view_count': 1000000,
 #     'like_count': 50000,
 #     'duration': 3725,  # 1 hour, 2 minutes, 5 seconds
 #     'tags': ['tag1', 'tag2', 'tag3'],
 #     'description': 'This is a sample video description.'
 # }
 #
 # print(format_metadata_as_text(example_metadata))


 def convert_to_seconds(time_str):
     if not time_str:
         return 0

     # If it's already a number, assume it's in seconds
     if time_str.isdigit():
         return int(time_str)

     # Parse time string in format HH:MM:SS, MM:SS, or SS
     time_parts = time_str.split(':')
     if len(time_parts) == 3:
         return int(timedelta(hours=int(time_parts[0]),
                              minutes=int(time_parts[1]),
                              seconds=int(time_parts[2])).total_seconds())
     elif len(time_parts) == 2:
         return int(timedelta(minutes=int(time_parts[0]),
                              seconds=int(time_parts[1])).total_seconds())
     elif len(time_parts) == 1:
         return int(time_parts[0])
     else:
         raise ValueError(f"Invalid time format: {time_str}")

 #
 # End of Misc-Functions
 #######################################################################################################################


 #######################################################################################################################
 #
 # File-saving Function Definitions
 def save_to_file(video_urls, filename):
     with open(filename, 'w') as file:
         file.write('\n'.join(video_urls))
     print(f"Video URLs saved to {filename}")


 def save_segments_to_json(segments, file_name="transcription_segments.json"):
     """
     Save transcription segments to a JSON file.

     Parameters:
     segments (list): List of transcription segments
     file_name (str): Name of the JSON file to save (default: "transcription_segments.json")

     Returns:
     str: Path to the saved JSON file
     """
     # Ensure the Results directory exists
     os.makedirs("Results", exist_ok=True)

     # Full path for the JSON file
     json_file_path = os.path.join("Results", file_name)

     # Save segments to JSON file
     with open(json_file_path, 'w', encoding='utf-8') as json_file:
         json.dump(segments, json_file, ensure_ascii=False, indent=4)

     return json_file_path


 def download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5):
     temp_path = dest_path + '.tmp'

     for attempt in range(max_retries):
         try:
             # Check if a partial download exists and get its size
             resume_header = {}
             if os.path.exists(temp_path):
                 resume_header = {'Range': f'bytes={os.path.getsize(temp_path)}-'}

             response = requests.get(url, stream=True, headers=resume_header)
             response.raise_for_status()

             # Get the total file size from headers
             total_size = int(response.headers.get('content-length', 0))
             initial_pos = os.path.getsize(temp_path) if os.path.exists(temp_path) else 0

             mode = 'ab' if 'Range' in response.headers else 'wb'
             with open(temp_path, mode) as temp_file, tqdm(
                 total=total_size, unit='B', unit_scale=True, desc=dest_path, initial=initial_pos, ascii=True
             ) as pbar:
                 for chunk in response.iter_content(chunk_size=8192):
                     if chunk:  # filter out keep-alive new chunks
                         temp_file.write(chunk)
                         pbar.update(len(chunk))

             # Verify the checksum if provided
             if expected_checksum:
                 if not verify_checksum(temp_path, expected_checksum):
                     os.remove(temp_path)
                     raise ValueError("Downloaded file's checksum does not match the expected checksum")

             # Move the file to the final destination
             os.rename(temp_path, dest_path)
             print("Download complete and verified!")
             return dest_path

         except Exception as e:
             print(f"Attempt {attempt + 1} failed: {e}")
             if attempt < max_retries - 1:
                 print(f"Retrying in {delay} seconds...")
                 time.sleep(delay)
             else:
                 print("Max retries reached. Download failed.")
                 raise

 def create_download_directory(title):
     base_dir = "Results"
     # Remove characters that are illegal in Windows filenames and normalize
     safe_title = normalize_title(title)
     logging.debug(f"{title} successfully normalized")
     session_path = os.path.join(base_dir, safe_title)
     if not os.path.exists(session_path):
         os.makedirs(session_path, exist_ok=True)
         logging.debug(f"Created directory for downloaded video: {session_path}")
     else:
         logging.debug(f"Directory already exists for downloaded video: {session_path}")
     return session_path


 def safe_read_file(file_path):
     encodings = ['utf-8', 'utf-16', 'ascii', 'latin-1', 'iso-8859-1', 'cp1252']
     for encoding in encodings:
         try:
             with open(file_path, 'r', encoding=encoding) as file:
                 return file.read()
         except UnicodeDecodeError:
             continue
         except FileNotFoundError:
             return f"File not found: {file_path}"
         except Exception as e:
             return f"An error occurred: {e}"
     return f"Unable to decode the file {file_path} with any of the attempted encodings: {encodings}"

 #
 # End of Files-saving Function Definitions
 #######################################################################################################################


 #######################################################################################################################
 #
 # UUID-Functions

 def generate_unique_filename(base_path, base_filename):
     """Generate a unique filename by appending a counter if necessary."""
     filename = base_filename
     counter = 1
     while os.path.exists(os.path.join(base_path, filename)):
         name, ext = os.path.splitext(base_filename)
         filename = f"{name}_{counter}{ext}"
         counter += 1
     return filename


 def generate_unique_identifier(file_path):
     filename = os.path.basename(file_path)
     timestamp = int(time.time())

     # Generate a hash of the file content
     hasher = hashlib.md5()
     with open(file_path, 'rb') as f:
         buf = f.read()
         hasher.update(buf)
     content_hash = hasher.hexdigest()[:8]  # Use first 8 characters of the hash

     return f"local:{timestamp}:{content_hash}:{filename}"

 #
 # End of UUID-Functions
 #######################################################################################################################


 #######################################################################################################################
 #
 # Backup code

 #
 # End of backup code
 #######################################################################################################################


 #######################################################################################################################
 #
 # Sanitization/Verification Functions

 # Helper function to validate URL format
 def is_valid_url(url: str) -> bool:
     regex = re.compile(
         r'^(?:http|ftp)s?://'  # http:// or https://
         r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
         r'localhost|'  # localhost...
         r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
         r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
         r'(?::\d+)?'  # optional port
         r'(?:/?|[/?]\S+)$', re.IGNORECASE)
     return re.match(regex, url) is not None


 def verify_checksum(file_path, expected_checksum):
     sha256_hash = hashlib.sha256()
     with open(file_path, 'rb') as f:
         for byte_block in iter(lambda: f.read(4096), b''):
             sha256_hash.update(byte_block)
     return sha256_hash.hexdigest() == expected_checksum


 def normalize_title(title):
     # Normalize the string to 'NFKD' form and encode to 'ascii' ignoring non-ascii characters
     title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')
     title = title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('"', '').replace('*', '').replace('?',
                                                                                                                    '').replace(
         '<', '').replace('>', '').replace('|', '')
     return title


 def clean_youtube_url(url):
     parsed_url = urlparse(url)
     query_params = parse_qs(parsed_url.query)
     if 'list' in query_params:
         query_params.pop('list')
     cleaned_query = urlencode(query_params, doseq=True)
     cleaned_url = urlunparse(parsed_url._replace(query=cleaned_query))
     return cleaned_url

 def sanitize_filename(filename):
     # Remove invalid characters and replace spaces with underscores
     sanitized = re.sub(r'[<>:"/\\|?*]', '', filename)
     sanitized = re.sub(r'\s+', ' ', sanitized).strip()
     return sanitized


 def format_transcription(content):
     # Replace '\n' with actual line breaks
     content = content.replace('\\n', '\n')
     # Split the content by newlines first
     lines = content.split('\n')
     formatted_lines = []
     for line in lines:
         # Add extra space after periods for better readability
         line = line.replace('.', '. ').replace('.  ', '. ')

         # Split into sentences using a more comprehensive regex
         sentences = re.split('(?<=[.!?]) +', line)

         # Trim whitespace from each sentence and add a line break
         formatted_sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

         # Join the formatted sentences
         formatted_lines.append(' '.join(formatted_sentences))

     # Join the lines with HTML line breaks
     formatted_content = '<br>'.join(formatted_lines)

     return formatted_content


 def format_file_path(file_path, fallback_path=None):
     if file_path and os.path.exists(file_path):
         logging.debug(f"File exists: {file_path}")
         return file_path
     elif fallback_path and os.path.exists(fallback_path):
         logging.debug(f"File does not exist: {file_path}. Returning fallback path: {fallback_path}")
         return fallback_path
     else:
         logging.debug(f"File does not exist: {file_path}. No fallback path available.")
         return None

 #
 # End of Sanitization/Verification Functions
 #######################################################################################################################


 #######################################################################################################################
 #
 # DB Config Loading


 def get_db_config():
     config = configparser.ConfigParser()
     config.read('config.txt')
     return {
         'type': config['Database']['type'],
         'sqlite_path': config.get('Database', 'sqlite_path', fallback='media_summary.db'),
         'elasticsearch_host': config.get('Database', 'elasticsearch_host', fallback='localhost'),
         'elasticsearch_port': config.getint('Database', 'elasticsearch_port', fallback=9200)
     }


 #
 # End of DB Config Loading
 #######################################################################################################################
 
 
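For reference, a short usage sketch of the function family touched here: load_and_log_configs() (unchanged by this commit) wraps load_comprehensive_config() and returns None on failure, otherwise the nested dict built above. The import below assumes the repository layout makes App_Function_Libraries importable as a package; the surrounding variable names are hypothetical:

    from App_Function_Libraries.Utils.Utils import load_and_log_configs

    config_data = load_and_log_configs()
    if config_data is None:
        raise RuntimeError("config.txt could not be loaded; check the debug log")

    # The keys below come straight from the dict returned above
    openai_key = config_data['api_keys']['openai']      # None when unset in config.txt
    openai_model = config_data['models']['openai']      # defaults to 'gpt-4-turbo'
    ollama_url = config_data['local_api_ip']['ollama']  # defaults to the local Ollama endpoint
    print(f"Model: {openai_model}, key configured: {openai_key is not None}, Ollama at: {ollama_url}")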