oceansweep committed on
Commit
451d270
1 Parent(s): cce77c2

Upload 3 files

Browse files
App_Function_Libraries/Utils/System_Checks_Lib.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # System_Checks_Lib.py
2
+ #########################################
3
+ # System Checks Library
4
+ # This library is used to check the system for the necessary dependencies to run the script.
5
+ # It checks for the OS, the availability of the GPU, and the availability of the ffmpeg executable.
6
+ # If the GPU is available, it asks the user if they would like to use it for processing.
7
+ # If ffmpeg is not found, it asks the user if they would like to download it.
8
+ # The script will exit if the user chooses not to download ffmpeg.
9
+ ####
10
+
11
+ ####################
12
+ # Function List
13
+ #
14
+ # 1. platform_check()
15
+ # 2. cuda_check()
16
+ # 3. decide_cpugpu()
17
+ # 4. check_ffmpeg()
18
+ # 5. download_ffmpeg()
19
+ #
20
+ ####################
21
+
22
+
23
+
24
+
25
+ # Import necessary libraries
26
+ import logging
27
+ import os
28
+ import platform
29
+ import requests
30
+ import shutil
31
+ import subprocess
32
+ import zipfile
33
+ # Import Local Libraries
34
+ #from App_Function_Libraries import
35
+ #
36
+ #######################################################################################################################
37
+ # Function Definitions
38
+ #
39
+
40
def platform_check():
    """Detect the host operating system and record it in the global ``userOS``.

    Sets ``userOS`` to ``"Linux"`` or ``"Windows"``.  Any other platform
    prints a message and exits the script.

    Returns:
        str: the detected OS name (also stored in the global), for callers
        that prefer a return value over the global.
    """
    global userOS
    # Call platform.system() once instead of twice.
    system_name = platform.system()
    if system_name == "Linux":
        print("Linux OS detected \n Running Linux appropriate commands")
        userOS = "Linux"
    elif system_name == "Windows":
        print("Windows OS detected \n Running Windows appropriate commands")
        userOS = "Windows"
    else:
        print("Other OS detected \n Maybe try running things manually?")
        exit()
    return userOS
51
+
52
+
53
+ # Check for NVIDIA GPU and CUDA availability
54
# Check for NVIDIA GPU and CUDA availability
def cuda_check():
    """Probe for an NVIDIA GPU via ``nvidia-smi`` and set the global
    ``processing_choice`` to ``"cuda"`` or ``"cpu"`` accordingly.

    Any failure to run or parse ``nvidia-smi`` falls back to ``"cpu"``.

    Returns:
        str: the chosen processing mode ("cuda" or "cpu").
    """
    global processing_choice
    try:
        # Run nvidia-smi as an argument list instead of shell=True: no shell
        # involvement needed for a fixed command, and it avoids quoting issues.
        nvidia_smi_output = subprocess.check_output(["nvidia-smi"]).decode()

        # Look for CUDA version in the output
        if "CUDA Version" in nvidia_smi_output:
            cuda_version = next(
                (line.split(":")[-1].strip() for line in nvidia_smi_output.splitlines() if "CUDA Version" in line),
                "Not found")
            print(f"NVIDIA GPU with CUDA Version {cuda_version} is available.")
            processing_choice = "cuda"
        else:
            print("CUDA is not installed or configured correctly.")
            processing_choice = "cpu"

    except subprocess.CalledProcessError as e:
        print(f"Failed to run 'nvidia-smi': {str(e)}")
        processing_choice = "cpu"
    except Exception as e:
        # Also covers FileNotFoundError when nvidia-smi is absent.
        print(f"An error occurred: {str(e)}")
        processing_choice = "cpu"

    # Informational only: CUDA_VISIBLE_DEVICES restricts which GPUs are visible.
    if "CUDA_VISIBLE_DEVICES" in os.environ:
        print("CUDA_VISIBLE_DEVICES is set:", os.environ["CUDA_VISIBLE_DEVICES"])
    else:
        print("CUDA_VISIBLE_DEVICES not set.")
    return processing_choice
83
+
84
+
85
+ # Ask user if they would like to use either their GPU or their CPU for transcription
86
# Ask user if they would like to use either their GPU or their CPU for transcription
def decide_cpugpu():
    """Ask the user whether to use the GPU or the CPU for transcription.

    Sets the global ``processing_choice`` to ``"cuda"`` or ``"cpu"``.
    GPU is only honoured when a previous ``cuda_check()`` established CUDA
    availability.  An unrecognised answer now falls back to "cpu" instead of
    potentially leaving ``processing_choice`` undefined (which previously
    caused a NameError later on).
    """
    global processing_choice
    # Guard against cuda_check() never having been called: treat the GPU as
    # unavailable rather than raising NameError on the comparison below.
    cuda_available = globals().get("processing_choice") == "cuda"
    processing_input = input("Would you like to use your GPU or CPU for transcription? (1/cuda)GPU/(2/cpu)CPU): ")
    if cuda_available and (processing_input.lower() == "cuda" or processing_input == "1"):
        print("You've chosen to use the GPU.")
        logging.debug("GPU is being used for processing")
        processing_choice = "cuda"
    elif processing_input.lower() == "cpu" or processing_input == "2":
        print("You've chosen to use the CPU.")
        logging.debug("CPU is being used for processing")
        processing_choice = "cpu"
    else:
        print("Invalid choice. Please select either GPU or CPU.")
        # Fail safe: default to CPU so downstream code never sees an unset value.
        processing_choice = "cpu"
99
+
100
+
101
+ # check for existence of ffmpeg
102
# check for existence of ffmpeg
def check_ffmpeg():
    """Verify that ffmpeg is available (on PATH or in ./Bin) and, when it is
    not, offer to download it on Windows or print install hints on Linux.

    Relies on the global ``userOS`` set by ``platform_check()``.  On an
    unsupported OS the user is asked whether to exit.
    """
    if shutil.which("ffmpeg") or (os.path.exists("Bin") and os.path.isfile(".\\Bin\\ffmpeg.exe")):
        logging.debug("ffmpeg found installed on the local system, in the local PATH, or in the './Bin' folder")
    else:
        logging.debug("ffmpeg not installed on the local system/in local PATH")
        print(
            "ffmpeg is not installed.\n\n You can either install it manually, or through your package manager of "
            "choice.\n Windows users, builds are here: https://www.gyan.dev/ffmpeg/builds/")
        if userOS == "Windows":
            download_ffmpeg()
        elif userOS == "Linux":
            print(
                "You should install ffmpeg using your platform's appropriate package manager, 'apt install ffmpeg',"
                "'dnf install ffmpeg' or 'pacman', etc.")
        else:
            logging.debug("running an unsupported OS")
            print("You're running an unsupported/untested OS")
            exit_script = input("Let's exit the script, unless you're feeling lucky? (y/n)")
            # BUG FIX: the original `if exit_script == "y" or "yes" or "1"` was
            # always true — non-empty string literals are truthy — so the
            # script exited regardless of the answer.
            if exit_script.lower() in ("y", "yes", "1"):
                exit()
123
+
124
+
125
+ # Download ffmpeg
126
# Download ffmpeg
def download_ffmpeg():
    """Interactively download a Windows ffmpeg build and place ffmpeg.exe in ./Bin.

    Prompts the user for confirmation, downloads the gyan.dev
    "release-essentials" zip, extracts only ffmpeg.exe into the local
    ``Bin`` folder, and deletes the zip afterwards.  Declining the prompt,
    a failed download, or a zip without ffmpeg.exe all return without error.

    NOTE(review): the whole response body is held in memory
    (``response.content``) — acceptable for this archive size, but not a
    streaming download.
    """
    user_choice = input("Do you want to download ffmpeg? (y)Yes/(n)No: ")
    if user_choice.lower() in ['yes', 'y', '1']:
        print("Downloading ffmpeg")
        url = "https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-essentials.zip"
        response = requests.get(url)

        if response.status_code == 200:
            print("Saving ffmpeg zip file")
            logging.debug("Saving ffmpeg zip file")
            zip_path = "ffmpeg-release-essentials.zip"
            with open(zip_path, 'wb') as file:
                file.write(response.content)

            logging.debug("Extracting the 'ffmpeg.exe' file from the zip")
            print("Extracting ffmpeg.exe from zip file to '/Bin' folder")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                # Find the ffmpeg.exe file within the zip (the archive nests
                # it inside a versioned folder, so the exact path varies).
                ffmpeg_path = None
                for file_info in zip_ref.infolist():
                    if file_info.filename.endswith("ffmpeg.exe"):
                        ffmpeg_path = file_info.filename
                        break

                if ffmpeg_path is None:
                    logging.error("ffmpeg.exe not found in the zip file.")
                    print("ffmpeg.exe not found in the zip file.")
                    return

                logging.debug("checking if the './Bin' folder exists, creating if not")
                bin_folder = "Bin"
                if not os.path.exists(bin_folder):
                    logging.debug("Creating a folder for './Bin', it didn't previously exist")
                    os.makedirs(bin_folder)

                # Extract preserves the zip's internal directory structure...
                logging.debug("Extracting 'ffmpeg.exe' to the './Bin' folder")
                zip_ref.extract(ffmpeg_path, path=bin_folder)

                # ...so flatten it: move the exe up to Bin/ffmpeg.exe.
                logging.debug("Moving 'ffmpeg.exe' to the './Bin' folder")
                src_path = os.path.join(bin_folder, ffmpeg_path)
                dst_path = os.path.join(bin_folder, "ffmpeg.exe")
                shutil.move(src_path, dst_path)

            logging.debug("Removing ffmpeg zip file")
            print("Deleting zip file (we've already extracted ffmpeg.exe, no worries)")
            os.remove(zip_path)

            logging.debug("ffmpeg.exe has been downloaded and extracted to the './Bin' folder.")
            print("ffmpeg.exe has been successfully downloaded and extracted to the './Bin' folder.")
        else:
            logging.error("Failed to download the zip file.")
            print("Failed to download the zip file.")
    else:
        logging.debug("User chose to not download ffmpeg")
        print("ffmpeg will not be downloaded.")
181
+
182
+ #
183
+ #
184
+ #######################################################################################################################
App_Function_Libraries/Utils/Utils.py ADDED
@@ -0,0 +1,614 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Utils.py
2
+ #########################################
3
+ # General Utilities Library
4
+ # This library is used to hold random utilities used by various other libraries.
5
+ #
6
+ ####
7
+ ####################
8
+ # Function List
9
+ #
10
+ # 1. extract_text_from_segments(segments: List[Dict]) -> str
11
+ # 2. download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5)
12
+ # 3. verify_checksum(file_path, expected_checksum)
13
+ # 4. create_download_directory(title)
14
+ # 5. sanitize_filename(filename)
15
+ # 6. normalize_title(title)
16
+ # 7.
17
+ #
18
+ #
19
+ #
20
+ ####################
21
+ # Import necessary libraries
22
+ import configparser
23
+ import hashlib
24
+ import json
25
+ import logging
26
+ import os
27
+ import re
28
+ import time
29
+ from datetime import timedelta
30
+ from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
31
+
32
+ import requests
33
+ import unicodedata
34
+ from tqdm import tqdm
35
+
36
+ #######################################################################################################################
37
+ # Function Definitions
38
+ #
39
+
40
def extract_text_from_segments(segments):
    """Pull transcription text out of an arbitrarily nested segment structure.

    Searches dicts depth-first for a 'Text' key and joins the texts of list
    elements with single spaces.

    Returns:
        str: the stripped extracted text, or an error message string when
        no text can be found.
    """
    logging.debug(f"Segments received: {segments}")
    logging.debug(f"Type of segments: {type(segments)}")

    def _walk(node):
        # Lists: join whatever each element yields, skipping empties.
        if isinstance(node, list):
            return ' '.join(filter(None, (_walk(item) for item in node)))
        # Dicts: a direct 'Text' key wins; otherwise recurse into containers.
        if isinstance(node, dict):
            for key, value in node.items():
                if key == 'Text':
                    return value
                if isinstance(value, (dict, list)):
                    found = _walk(value)
                    if found:
                        return found
        return None

    extracted = _walk(segments)
    if not extracted:
        logging.error(f"Unable to extract text from segments: {segments}")
        return "Error: Unable to extract transcription"
    return extracted.strip()
64
+
65
def import_data(file):
    """Import data from *file*.

    Placeholder: not yet implemented; currently a no-op returning None.
    """
    # TODO: implement actual data import
    pass
68
+
69
+ #
70
+ #
71
+ #######################
72
+ # Temp file cleanup
73
+ #
74
+ # Global list to keep track of downloaded files
75
# Module-level registry of files downloaded this session; targeted by cleanup.
downloaded_files = []

def cleanup_downloads():
    """Delete every file recorded in ``downloaded_files`` (server-exit hook).

    Errors are reported to stdout but never raised, so shutdown always
    proceeds.
    """
    for path in downloaded_files:
        try:
            if os.path.exists(path):
                os.remove(path)
                print(f"Cleaned up file: {path}")
        except Exception as err:
            print(f"Error cleaning up file {path}: {err}")
86
+
87
+ #
88
+ #
89
+ #######################################################################################################################
90
+
91
+
92
+ #######################################################################################################################
93
+ # Config loading
94
+ #
95
+
96
def load_comprehensive_config():
    """Load ``config.txt`` from the project root (one level above this file).

    Returns:
        configparser.ConfigParser: the populated parser.

    Raises:
        FileNotFoundError: when the config file is missing or unreadable.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(here)
    config_path = os.path.join(project_root, 'config.txt')
    parser = configparser.ConfigParser()
    # ConfigParser.read returns the list of files it actually parsed;
    # an empty list means the file was not found/readable.
    if not parser.read(config_path):
        raise FileNotFoundError(f"Config file not found at {config_path}")
    return parser
110
+
111
+
112
+ # FIXME - update to include prompt path in return statement
113
def load_and_log_configs():
    """Load config.txt (via ``load_comprehensive_config``) and return all
    settings as a nested dict::

        {'api_keys': {...}, 'models': {...}, 'local_api_ip': {...},
         'output_path': str, 'processing_choice': str, 'prompt_path': str}

    API keys are logged partially masked (first/last 5 characters).
    Returns None when the configuration cannot be loaded.
    """

    def _mask(key):
        # BUG FIX: the original masking f-strings sliced the key
        # unconditionally (key[:5]) and raised TypeError when a key was
        # missing (fallback=None), which silently collapsed the whole config
        # load to None via the outer except.
        return f"{key[:5]}...{key[-5:]}" if key else str(key)

    try:
        config = load_comprehensive_config()
        if config is None:
            logging.error("Config is None, cannot proceed")
            return None

        # API Keys (logged masked)
        anthropic_api_key = config.get('API', 'anthropic_api_key', fallback=None)
        logging.debug(f"Loaded Anthropic API Key: {_mask(anthropic_api_key)}")
        cohere_api_key = config.get('API', 'cohere_api_key', fallback=None)
        logging.debug(f"Loaded Cohere API Key: {_mask(cohere_api_key)}")
        groq_api_key = config.get('API', 'groq_api_key', fallback=None)
        logging.debug(f"Loaded Groq API Key: {_mask(groq_api_key)}")
        openai_api_key = config.get('API', 'openai_api_key', fallback=None)
        logging.debug(f"Loaded OpenAI API Key: {_mask(openai_api_key)}")
        huggingface_api_key = config.get('API', 'huggingface_api_key', fallback=None)
        logging.debug(f"Loaded HuggingFace API Key: {_mask(huggingface_api_key)}")
        openrouter_api_key = config.get('API', 'openrouter_api_key', fallback=None)
        logging.debug(f"Loaded OpenRouter API Key: {_mask(openrouter_api_key)}")
        deepseek_api_key = config.get('API', 'deepseek_api_key', fallback=None)
        logging.debug(f"Loaded DeepSeek API Key: {_mask(deepseek_api_key)}")
        mistral_api_key = config.get('API', 'mistral_api_key', fallback=None)
        logging.debug(f"Loaded Mistral API Key: {_mask(mistral_api_key)}")

        # Models (with upstream defaults)
        anthropic_model = config.get('API', 'anthropic_model', fallback='claude-3-sonnet-20240229')
        cohere_model = config.get('API', 'cohere_model', fallback='command-r-plus')
        groq_model = config.get('API', 'groq_model', fallback='llama3-70b-8192')
        openai_model = config.get('API', 'openai_model', fallback='gpt-4-turbo')
        huggingface_model = config.get('API', 'huggingface_model', fallback='CohereForAI/c4ai-command-r-plus')
        openrouter_model = config.get('API', 'openrouter_model', fallback='microsoft/wizardlm-2-8x22b')
        deepseek_model = config.get('API', 'deepseek_model', fallback='deepseek-chat')
        mistral_model = config.get('API', 'mistral_model', fallback='mistral-large-latest')

        logging.debug(f"Loaded Anthropic Model: {anthropic_model}")
        logging.debug(f"Loaded Cohere Model: {cohere_model}")
        logging.debug(f"Loaded Groq Model: {groq_model}")
        logging.debug(f"Loaded OpenAI Model: {openai_model}")
        logging.debug(f"Loaded HuggingFace Model: {huggingface_model}")
        logging.debug(f"Loaded OpenRouter Model: {openrouter_model}")
        logging.debug(f"Loaded Deepseek Model: {deepseek_model}")
        logging.debug(f"Loaded Mistral Model: {mistral_model}")

        # Local-Model endpoints and keys
        kobold_api_ip = config.get('Local-API', 'kobold_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
        kobold_api_key = config.get('Local-API', 'kobold_api_key', fallback='')

        llama_api_IP = config.get('Local-API', 'llama_api_IP', fallback='http://127.0.0.1:8080/v1/chat/completions')
        llama_api_key = config.get('Local-API', 'llama_api_key', fallback='')

        ooba_api_IP = config.get('Local-API', 'ooba_api_IP', fallback='http://127.0.0.1:5000/v1/chat/completions')
        ooba_api_key = config.get('Local-API', 'ooba_api_key', fallback='')

        tabby_api_IP = config.get('Local-API', 'tabby_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
        tabby_api_key = config.get('Local-API', 'tabby_api_key', fallback=None)
        tabby_model = config.get('models', 'tabby_model', fallback=None)

        vllm_api_url = config.get('Local-API', 'vllm_api_IP', fallback='http://127.0.0.1:500/api/v1/chat/completions')
        vllm_api_key = config.get('Local-API', 'vllm_api_key', fallback=None)
        vllm_model = config.get('Local-API', 'vllm_model', fallback=None)

        ollama_api_url = config.get('Local-API', 'ollama_api_IP', fallback='http://127.0.0.1:11434/api/generate')
        ollama_api_key = config.get('Local-API', 'ollama_api_key', fallback=None)
        ollama_model = config.get('Local-API', 'ollama_model', fallback=None)

        aphrodite_api_url = config.get('Local-API', 'aphrodite_api_IP', fallback='http://127.0.0.1:8080/v1/chat/completions')
        aphrodite_api_key = config.get('Local-API', 'aphrodite_api_key', fallback='')

        logging.debug(f"Loaded Kobold API IP: {kobold_api_ip}")
        logging.debug(f"Loaded Llama API IP: {llama_api_IP}")
        logging.debug(f"Loaded Ooba API IP: {ooba_api_IP}")
        logging.debug(f"Loaded Tabby API IP: {tabby_api_IP}")
        logging.debug(f"Loaded VLLM API URL: {vllm_api_url}")

        # Retrieve output paths from the configuration file
        output_path = config.get('Paths', 'output_path', fallback='results')
        logging.debug(f"Output path set to: {output_path}")

        # Retrieve processing choice from the configuration file
        processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
        logging.debug(f"Processing choice set to: {processing_choice}")

        # Prompt database path (FIXME resolved: now included in the return value)
        prompt_path = config.get('Prompts', 'prompt_path', fallback='prompts.db')

        return {
            'api_keys': {
                'anthropic': anthropic_api_key,
                'cohere': cohere_api_key,
                'groq': groq_api_key,
                'openai': openai_api_key,
                'huggingface': huggingface_api_key,
                'openrouter': openrouter_api_key,
                'deepseek': deepseek_api_key,
                'mistral': mistral_api_key,
                'kobold': kobold_api_key,
                'llama': llama_api_key,
                'ooba': ooba_api_key,
                'tabby': tabby_api_key,
                'vllm': vllm_api_key,
                'ollama': ollama_api_key,
                # BUG FIX: aphrodite key was loaded but never returned.
                'aphrodite': aphrodite_api_key
            },
            'models': {
                'anthropic': anthropic_model,
                'cohere': cohere_model,
                'groq': groq_model,
                'openai': openai_model,
                'huggingface': huggingface_model,
                'openrouter': openrouter_model,
                'deepseek': deepseek_model,
                'mistral': mistral_model,
                'vllm': vllm_model,
                'tabby': tabby_model,
                'ollama': ollama_model
            },
            'local_api_ip': {
                'kobold': kobold_api_ip,
                'llama': llama_api_IP,
                'ooba': ooba_api_IP,
                'tabby': tabby_api_IP,
                'vllm': vllm_api_url,
                'ollama': ollama_api_url,
                'aphrodite': aphrodite_api_url
            },
            'output_path': output_path,
            'processing_choice': processing_choice,
            'prompt_path': prompt_path
        }

    except Exception as e:
        logging.error(f"Error loading config: {str(e)}")
        return None
259
+
260
+ #
261
+ # End of Config loading
262
+ #######################################################################################################################
263
+
264
+
265
+ #######################################################################################################################
266
+ #
267
+ # Prompt Handling Functions
268
+
269
+
270
+
271
+ #
272
+ # End of Prompt Handling Functions
273
+ ### #############################################################################################################
274
+
275
+ #######################################################################################################################
276
+ #
277
+ # Misc-Functions
278
+
279
+ # Log file
280
+ # logging.basicConfig(filename='debug-runtime.log', encoding='utf-8', level=logging.DEBUG)
281
+
282
def format_metadata_as_text(metadata):
    """Render a video-metadata dict as human-readable text, one field per line.

    None values are skipped; lists are comma-joined; ``upload_date``
    (YYYYMMDD) becomes YYYY-MM-DD; ``view_count``/``like_count`` get
    thousands separators; ``duration`` (integer seconds) becomes HH:MM:SS.

    Returns:
        str: the formatted text, or "No metadata available" for empty input.
    """
    if not metadata:
        return "No metadata available"

    formatted_text = "Video Metadata:\n"
    for key, value in metadata.items():
        if value is None:
            continue
        if isinstance(value, list):
            # Join list items with commas
            formatted_value = ", ".join(str(item) for item in value)
        elif key == 'upload_date' and len(str(value)) == 8:
            # BUG FIX: slice the *string* form of the value — the guard
            # already accepts integer dates (e.g. 20230615), but the original
            # sliced `value` directly and raised TypeError for ints.
            date_str = str(value)
            formatted_value = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:]}"
        elif key in ['view_count', 'like_count']:
            # Format large numbers with commas
            formatted_value = f"{value:,}"
        elif key == 'duration':
            # Convert seconds to HH:MM:SS format
            hours, remainder = divmod(value, 3600)
            minutes, seconds = divmod(remainder, 60)
            formatted_value = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
        else:
            formatted_value = str(value)

        formatted_text += f"{key.capitalize()}: {formatted_value}\n"
    return formatted_text.strip()
308
+
309
+ # # Example usage:
310
+ # example_metadata = {
311
+ # 'title': 'Sample Video Title',
312
+ # 'uploader': 'Channel Name',
313
+ # 'upload_date': '20230615',
314
+ # 'view_count': 1000000,
315
+ # 'like_count': 50000,
316
+ # 'duration': 3725, # 1 hour, 2 minutes, 5 seconds
317
+ # 'tags': ['tag1', 'tag2', 'tag3'],
318
+ # 'description': 'This is a sample video description.'
319
+ # }
320
+ #
321
+ # print(format_metadata_as_text(example_metadata))
322
+
323
+
324
def convert_to_seconds(time_str):
    """Convert "HH:MM:SS", "MM:SS", "SS", or a bare digit string to integer
    seconds.  Falsy input (None, "") yields 0.

    Raises:
        ValueError: when the string has more than three colon-separated
        parts, or a part is not an integer.
    """
    if not time_str:
        return 0

    # Bare digits are already seconds.
    if time_str.isdigit():
        return int(time_str)

    pieces = time_str.split(':')
    if len(pieces) == 3:
        hrs, mins, secs = (int(p) for p in pieces)
        return int(timedelta(hours=hrs, minutes=mins, seconds=secs).total_seconds())
    if len(pieces) == 2:
        mins, secs = (int(p) for p in pieces)
        return int(timedelta(minutes=mins, seconds=secs).total_seconds())
    if len(pieces) == 1:
        return int(pieces[0])
    raise ValueError(f"Invalid time format: {time_str}")
345
+
346
+ #
347
+ # End of Misc-Functions
348
+ #######################################################################################################################
349
+
350
+
351
+ #######################################################################################################################
352
+ #
353
+ # File-saving Function Definitions
354
def save_to_file(video_urls, filename):
    """Write each video URL on its own line to *filename* (overwriting).

    Parameters:
        video_urls (list[str]): URLs to save
        filename (str): destination file path
    """
    with open(filename, 'w') as file:
        file.write('\n'.join(video_urls))
    # BUG FIX: the message previously printed a literal placeholder instead
    # of the actual filename.
    print(f"Video URLs saved to {filename}")
358
+
359
+
360
def save_segments_to_json(segments, file_name="transcription_segments.json"):
    """Serialise transcription *segments* to a JSON file under ./Results.

    Parameters:
        segments (list): transcription segments to persist
        file_name (str): file name inside the Results directory
            (default: "transcription_segments.json")

    Returns:
        str: path of the written JSON file.
    """
    results_dir = "Results"
    # Create the output directory on first use.
    os.makedirs(results_dir, exist_ok=True)

    target_path = os.path.join(results_dir, file_name)
    with open(target_path, 'w', encoding='utf-8') as handle:
        json.dump(segments, handle, ensure_ascii=False, indent=4)
    return target_path
382
+
383
+
384
def download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5):
    """Download *url* to *dest_path* with resume support, a progress bar,
    optional SHA-256 verification, and retries.

    Parameters:
        url (str): source URL
        dest_path (str): final destination path
        expected_checksum (str|None): hex SHA-256 to verify against
        max_retries (int): number of attempts before giving up
        delay (int): seconds to wait between attempts

    Returns:
        str: dest_path on success.

    Raises:
        ValueError: when the checksum does not match.
        Exception: the last download error after max_retries attempts.
    """
    temp_path = dest_path + '.tmp'

    for attempt in range(max_retries):
        try:
            # Resume a partial download if a temp file is already present.
            resume_header = {}
            if os.path.exists(temp_path):
                resume_header = {'Range': f'bytes={os.path.getsize(temp_path)}-'}

            response = requests.get(url, stream=True, headers=resume_header)
            response.raise_for_status()

            # Total size from headers (0 when the server omits it).
            total_size = int(response.headers.get('content-length', 0))
            initial_pos = os.path.getsize(temp_path) if os.path.exists(temp_path) else 0

            # BUG FIX: a resumed (partial-content) response is signalled by
            # HTTP 206 / a Content-Range *response* header.  The original
            # checked for the request header 'Range' in response.headers,
            # which is never present, so it always opened in 'wb' and
            # discarded previously downloaded bytes.
            resumed = response.status_code == 206 or 'Content-Range' in response.headers
            mode = 'ab' if resumed else 'wb'
            with open(temp_path, mode) as temp_file, tqdm(
                total=total_size, unit='B', unit_scale=True, desc=dest_path, initial=initial_pos, ascii=True
            ) as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # filter out keep-alive new chunks
                        temp_file.write(chunk)
                        pbar.update(len(chunk))

            # Verify the checksum if provided
            if expected_checksum:
                if not verify_checksum(temp_path, expected_checksum):
                    os.remove(temp_path)
                    raise ValueError("Downloaded file's checksum does not match the expected checksum")

            # Move the file to the final destination
            os.rename(temp_path, dest_path)
            print("Download complete and verified!")
            return dest_path

        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print("Max retries reached. Download failed.")
                raise
429
+
430
def create_download_directory(title):
    """Create (if needed) and return ``Results/<normalized title>`` for a download.

    The title is normalised via ``normalize_title`` to be filesystem-safe.

    Returns:
        str: the session directory path.
    """
    base_dir = "Results"
    # Make the title safe for Windows/Unix filesystems.
    safe_title = normalize_title(title)
    logging.debug(f"{title} successfully normalized")
    session_path = os.path.join(base_dir, safe_title)
    if os.path.exists(session_path):
        logging.debug(f"Directory already exists for downloaded video: {session_path}")
    else:
        os.makedirs(session_path, exist_ok=True)
        logging.debug(f"Created directory for downloaded video: {session_path}")
    return session_path
442
+
443
+
444
def safe_read_file(file_path):
    """Read a text file, trying several encodings in turn.

    Returns:
        str: the file contents on success; otherwise a descriptive error
        message (missing file, unexpected error, or undecodable content).
    """
    candidate_encodings = ['utf-8', 'utf-16', 'ascii', 'latin-1', 'iso-8859-1', 'cp1252']
    for enc in candidate_encodings:
        try:
            with open(file_path, 'r', encoding=enc) as handle:
                return handle.read()
        except UnicodeDecodeError:
            # Wrong guess — move on to the next encoding.
            continue
        except FileNotFoundError:
            return f"File not found: {file_path}"
        except Exception as e:
            return f"An error occurred: {e}"
    return f"Unable to decode the file {file_path} with any of the attempted encodings: {candidate_encodings}"
457
+
458
+ #
459
+ # End of Files-saving Function Definitions
460
+ #######################################################################################################################
461
+
462
+
463
+ #######################################################################################################################
464
+ #
465
+ # UUID-Functions
466
+
467
def generate_unique_filename(base_path, base_filename):
    """Return *base_filename*, or ``name_N.ext`` for the smallest N >= 1 that
    does not collide with an existing file under *base_path*."""
    stem, ext = os.path.splitext(base_filename)
    candidate = base_filename
    suffix = 0
    while os.path.exists(os.path.join(base_path, candidate)):
        suffix += 1
        candidate = f"{stem}_{suffix}{ext}"
    return candidate
476
+
477
+
478
def generate_unique_identifier(file_path):
    """Build an identifier ``local:<timestamp>:<md5-8>:<filename>`` for a
    local file.

    MD5 is used purely as a fast content fingerprint (first 8 hex chars),
    not for security.

    Returns:
        str: the composed identifier.
    """
    filename = os.path.basename(file_path)
    timestamp = int(time.time())

    # Generate a hash of the file content
    hasher = hashlib.md5()
    with open(file_path, 'rb') as f:
        hasher.update(f.read())
    content_hash = hasher.hexdigest()[:8]  # first 8 characters of the hash

    # BUG FIX: the identifier previously dropped the filename component
    # (the computed `filename` variable was never used).
    return f"local:{timestamp}:{content_hash}:{filename}"
490
+
491
+ #
492
+ # End of UUID-Functions
493
+ #######################################################################################################################
494
+
495
+
496
+ #######################################################################################################################
497
+ #
498
+ # Backup code
499
+
500
+ #
501
+ # End of backup code
502
+ #######################################################################################################################
503
+
504
+
505
+ #######################################################################################################################
506
+ #
507
+ # Sanitization/Verification Functions
508
+
509
+ # Helper function to validate URL format
510
# Helper function to validate URL format
def is_valid_url(url: str) -> bool:
    """Return True when *url* looks like a well-formed http/https/ftp/ftps URL.

    Accepts domain names, localhost, IPv4 and (loosely) IPv6 hosts, with an
    optional port and path/query component.
    """
    pattern = re.compile(
        r'^(?:http|ftp)s?://'  # scheme
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain
        r'localhost|'  # localhost
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # IPv4
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # IPv6
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return pattern.match(url) is not None
520
+
521
+
522
def verify_checksum(file_path, expected_checksum):
    """Return True when the SHA-256 digest of *file_path* equals
    *expected_checksum* (a hex string)."""
    digest = hashlib.sha256()
    with open(file_path, 'rb') as handle:
        # Stream in 4 KiB chunks so large files don't load into memory.
        while True:
            chunk = handle.read(4096)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest() == expected_checksum
528
+
529
+
530
def normalize_title(title):
    """ASCII-fold *title* and strip/replace characters illegal in Windows
    filenames.

    Slashes, backslashes, and colons become underscores; the remaining
    reserved characters (" * ? < > |) are removed.
    """
    # NFKD-normalise then drop anything that is not representable in ASCII.
    ascii_title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')
    # One-pass character mapping instead of chained .replace() calls.
    translation = str.maketrans({'/': '_', '\\': '_', ':': '_',
                                 '"': '', '*': '', '?': '',
                                 '<': '', '>': '', '|': ''})
    return ascii_title.translate(translation)
537
+
538
+
539
def clean_youtube_url(url):
    """Strip the playlist ('list') query parameter from a YouTube URL, if
    present, and return the rebuilt URL."""
    parts = urlparse(url)
    params = parse_qs(parts.query)
    # Remove the playlist parameter when it exists; no-op otherwise.
    params.pop('list', None)
    rebuilt_query = urlencode(params, doseq=True)
    return urlunparse(parts._replace(query=rebuilt_query))
547
+
548
def sanitize_filename(filename):
    """Drop characters invalid in filenames and collapse whitespace runs to
    single spaces, trimming the ends."""
    without_invalid = re.sub(r'[<>:"/\\|?*]', '', filename)
    collapsed = re.sub(r'\s+', ' ', without_invalid)
    return collapsed.strip()
553
+
554
+
555
def format_transcription(content):
    """Reflow raw transcription text into HTML-friendly output.

    Literal ``\\n`` escape sequences become real newlines, sentences are
    split on terminal punctuation and re-joined with normalised spacing,
    and the resulting lines are joined with ``<br>`` tags.
    """
    # Replace '\n' with actual line breaks
    content = content.replace('\\n', '\n')
    # Split the content by newlines first
    lines = content.split('\n')
    formatted_lines = []
    for line in lines:
        # Add extra space after periods for better readability
        # NOTE(review): the second replace appears to map '. ' to itself (a
        # no-op); it was presumably intended to collapse the double spaces
        # introduced by the first replace ('.  ' -> '. ') — confirm against
        # the original source, as whitespace may have been mangled.
        line = line.replace('.', '. ').replace('. ', '. ')

        # Split into sentences using a more comprehensive regex
        sentences = re.split('(?<=[.!?]) +', line)

        # Trim whitespace from each sentence and add a line break
        formatted_sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

        # Join the formatted sentences
        formatted_lines.append(' '.join(formatted_sentences))

    # Join the lines with HTML line breaks
    formatted_content = '<br>'.join(formatted_lines)

    return formatted_content
578
+
579
+
580
def format_file_path(file_path, fallback_path=None):
    """Return *file_path* when it exists on disk, otherwise *fallback_path*
    when that exists, otherwise None."""
    if file_path and os.path.exists(file_path):
        logging.debug(f"File exists: {file_path}")
        return file_path
    if fallback_path and os.path.exists(fallback_path):
        logging.debug(f"File does not exist: {file_path}. Returning fallback path: {fallback_path}")
        return fallback_path
    logging.debug(f"File does not exist: {file_path}. No fallback path available.")
    return None
590
+
591
+ #
592
+ # End of Sanitization/Verification Functions
593
+ #######################################################################################################################
594
+
595
+
596
+ #######################################################################################################################
597
+ #
598
+ # DB Config Loading
599
+
600
+
601
def get_db_config():
    """Read database settings from ./config.txt and return them as a dict.

    Keys: 'type', 'sqlite_path', 'elasticsearch_host', 'elasticsearch_port'.
    Every key has a sensible default, so a missing config.txt or [Database]
    section no longer raises.
    """
    config = configparser.ConfigParser()
    config.read('config.txt')
    return {
        # BUG FIX: the original used config['Database']['type'], which raised
        # KeyError whenever the section or option was absent, unlike every
        # other key here which already had a fallback.
        'type': config.get('Database', 'type', fallback='sqlite'),
        'sqlite_path': config.get('Database', 'sqlite_path', fallback='media_summary.db'),
        'elasticsearch_host': config.get('Database', 'elasticsearch_host', fallback='localhost'),
        'elasticsearch_port': config.getint('Database', 'elasticsearch_port', fallback=9200)
    }
610
+
611
+
612
+ #
613
+ # End of DB Config Loading
614
+ #######################################################################################################################
App_Function_Libraries/Utils/__init__.py ADDED
File without changes