oceansweep committed on
Commit
4bcbef7
1 Parent(s): 99313c7

Delete App_Function_Libraries/Summarization_General_Lib.py

Browse files
App_Function_Libraries/Summarization_General_Lib.py DELETED
@@ -1,1580 +0,0 @@
1
- # Summarization_General_Lib.py
2
- #########################################
3
- # General Summarization Library
4
- # This library is used to perform summarization.
5
- #
6
- ####
7
- ####################
8
- # Function List
9
- #
10
- # 1. extract_text_from_segments(segments: List[Dict]) -> str
11
- # 2. summarize_with_openai(api_key, file_path, custom_prompt_arg)
12
- # 3. summarize_with_anthropic(api_key, file_path, model, custom_prompt_arg, max_retries=3, retry_delay=5)
13
- # 4. summarize_with_cohere(api_key, file_path, model, custom_prompt_arg)
14
- # 5. summarize_with_groq(api_key, file_path, model, custom_prompt_arg)
15
- #
16
- #
17
- ####################
18
- # Import necessary libraries
19
- import json
20
- import logging
21
- import os
22
- import time
23
- from typing import Optional
24
-
25
- import requests
26
- from requests import RequestException
27
-
28
- from App_Function_Libraries.Audio_Transcription_Lib import convert_to_wav, speech_to_text
29
- from App_Function_Libraries.Chunk_Lib import semantic_chunking, rolling_summarize, recursive_summarize_chunks, \
30
- improved_chunking_process
31
- from App_Function_Libraries.Diarization_Lib import combine_transcription_and_diarization
32
- from App_Function_Libraries.Local_Summarization_Lib import summarize_with_llama, summarize_with_kobold, \
33
- summarize_with_oobabooga, summarize_with_tabbyapi, summarize_with_vllm, summarize_with_local_llm
34
- from App_Function_Libraries.DB.DB_Manager import add_media_to_database
35
- # Import Local
36
- from App_Function_Libraries.Utils.Utils import load_and_log_configs, load_comprehensive_config, sanitize_filename, \
37
- clean_youtube_url, create_download_directory, is_valid_url
38
- from App_Function_Libraries.Video_DL_Ingestion_Lib import download_video, extract_video_info
39
-
40
- #
41
- #######################################################################################################################
42
- # Function Definitions
43
- #
44
- # Module-level configuration, loaded once at import time via load_comprehensive_config().
- config = load_comprehensive_config()
45
- # Eagerly cached OpenAI key from the [API] section; None when the entry is absent.
- openai_api_key = config.get('API', 'openai_api_key', fallback=None)
46
-
47
-
48
def summarize(
    input_data: str,
    custom_prompt_arg: Optional[str],
    api_name: str,
    api_key: Optional[str],
    temp: Optional[float],
    system_message: Optional[str]
) -> str:
    """Dispatch a summarization request to the backend selected by *api_name*.

    Parameters:
        input_data: Text, JSON string, file path, or segment list to summarize.
        custom_prompt_arg: Optional prompt appended to the content.
        api_name: Case-insensitive backend name (e.g. "openai", "anthropic").
        api_key: Optional API key; remote backends fall back to config values.
        temp: Optional sampling temperature; each backend applies its own default.
        system_message: Optional system-role message for chat-style backends.

    Returns:
        The backend's summary string, or an "Error: ..." string on failure
        (including an unknown api_name).
    """
    try:
        logging.debug(f"api_name type: {type(api_name)}, value: {api_name}")
        # Normalize once; the original recomputed api_name.lower() in every branch.
        name = api_name.lower()
        if name == "openai":
            return summarize_with_openai(api_key, input_data, custom_prompt_arg, temp, system_message)
        elif name == "anthropic":
            return summarize_with_anthropic(api_key, input_data, custom_prompt_arg, temp, system_message)
        elif name == "cohere":
            return summarize_with_cohere(api_key, input_data, custom_prompt_arg, temp, system_message)
        elif name == "groq":
            return summarize_with_groq(api_key, input_data, custom_prompt_arg, temp, system_message)
        elif name == "huggingface":
            # NOTE(review): the HuggingFace wrapper does not accept system_message yet.
            return summarize_with_huggingface(api_key, input_data, custom_prompt_arg, temp)
        elif name == "openrouter":
            return summarize_with_openrouter(api_key, input_data, custom_prompt_arg, temp, system_message)
        elif name == "deepseek":
            return summarize_with_deepseek(api_key, input_data, custom_prompt_arg, temp, system_message)
        elif name == "mistral":
            return summarize_with_mistral(api_key, input_data, custom_prompt_arg, temp, system_message)
        # Local backends take input_data first; most ignore api_key.
        elif name == "llama.cpp":
            return summarize_with_llama(input_data, custom_prompt_arg, temp, system_message)
        elif name == "kobold":
            return summarize_with_kobold(input_data, api_key, custom_prompt_arg, temp, system_message)
        elif name == "ooba":
            return summarize_with_oobabooga(input_data, api_key, custom_prompt_arg, temp, system_message)
        elif name == "tabbyapi":
            return summarize_with_tabbyapi(input_data, custom_prompt_arg, temp, system_message)
        elif name == "vllm":
            return summarize_with_vllm(input_data, custom_prompt_arg, None, system_message)
        elif name == "local-llm":
            return summarize_with_local_llm(input_data, custom_prompt_arg, temp, system_message)
        # Removed: the original had a second, unreachable "huggingface" branch here.
        else:
            return f"Error: Invalid API Name {api_name}"

    except Exception as e:
        logging.error(f"Error in summarize function: {str(e)}", exc_info=True)
        return f"Error: {str(e)}"
94
-
95
-
96
def extract_text_from_segments(segments):
    """Concatenate the 'Text' field of each segment dict into one string.

    Segments lacking a 'Text' key are skipped with a warning; a non-list
    argument yields an empty string. The result is whitespace-stripped.
    """
    logging.debug(f"Segments received: {segments}")
    logging.debug(f"Type of segments: {type(segments)}")

    collected = []
    if isinstance(segments, list):
        for segment in segments:
            logging.debug(f"Current segment: {segment}")
            logging.debug(f"Type of segment: {type(segment)}")
            if 'Text' in segment:
                # Trailing space mirrors the original accumulation style.
                collected.append(segment['Text'] + " ")
            else:
                logging.warning(f"Skipping segment due to missing 'Text' key: {segment}")
    else:
        logging.warning(f"Unexpected type of 'segments': {type(segments)}")

    return "".join(collected).strip()
114
-
115
-
116
def summarize_with_openai(api_key, input_data, custom_prompt_arg, temp=None, system_message=None):
    """Summarize text with the OpenAI chat-completions endpoint.

    Parameters:
        api_key: OpenAI API key; when empty, falls back to the config file.
        input_data: A JSON string, a path to a JSON file, a dict (optionally
            containing 'summary' or 'segments'), a segment list, or raw text.
        custom_prompt_arg: Prompt text appended after the extracted content.
        temp: Sampling temperature; defaults to 0.7 when None.
        system_message: System-role content; defaults to a generic assistant
            message when None.

    Returns:
        The summary string on success, an existing 'summary' value when the
        input already carries one, or a human-readable error string.
    """
    loaded_config_data = load_and_log_configs()
    try:
        # --- API key resolution: parameter first, then config file ---
        if not api_key or api_key.strip() == "":
            logging.info("OpenAI: #1 API key not provided as parameter")
            logging.info("OpenAI: Attempting to use API key from config file")
            # Guard the lookup: a failed config load (None) now yields the
            # "not found" message below instead of a TypeError — consistent
            # with the None checks done by the sibling backends.
            if loaded_config_data is not None:
                api_key = loaded_config_data['api_keys'].get('openai')

        if not api_key or api_key.strip() == "":
            logging.error("OpenAI: #2 API key not found or is empty")
            return "OpenAI: API Key Not Provided/Found in Config file or is empty"

        openai_api_key = api_key
        logging.debug(f"OpenAI: Using API Key: {api_key[:5]}...{api_key[-5:]}")

        # --- Input data handling: JSON string, file path, or raw payload ---
        logging.debug(f"OpenAI: Raw input data type: {type(input_data)}")
        logging.debug(f"OpenAI: Raw input data (first 500 chars): {str(input_data)[:500]}...")

        if isinstance(input_data, str):
            if input_data.strip().startswith('{'):
                # It's likely a JSON string
                logging.debug("OpenAI: Parsing provided JSON string data for summarization")
                try:
                    data = json.loads(input_data)
                except json.JSONDecodeError as e:
                    logging.error(f"OpenAI: Error parsing JSON string: {str(e)}")
                    return f"OpenAI: Error parsing JSON input: {str(e)}"
            elif os.path.isfile(input_data):
                logging.debug("OpenAI: Loading JSON data from file for summarization")
                with open(input_data, 'r') as file:
                    data = json.load(file)
            else:
                logging.debug("OpenAI: Using provided string data for summarization")
                data = input_data
        else:
            data = input_data

        logging.debug(f"OpenAI: Processed data type: {type(data)}")
        logging.debug(f"OpenAI: Processed data (first 500 chars): {str(data)[:500]}...")

        # --- Text extraction ---
        if isinstance(data, dict):
            if 'summary' in data:
                logging.debug("OpenAI: Summary already exists in the loaded data")
                return data['summary']
            elif 'segments' in data:
                text = extract_text_from_segments(data['segments'])
            else:
                text = json.dumps(data)  # Convert dict to string if no specific format
        elif isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError(f"OpenAI: Invalid input data format: {type(data)}")

        logging.debug(f"OpenAI: Extracted text (first 500 chars): {text[:500]}...")
        logging.debug(f"OpenAI: Custom prompt: {custom_prompt_arg}")

        # Fall back to gpt-4o when the config has no model or failed to load.
        if loaded_config_data is not None:
            openai_model = loaded_config_data['models']['openai'] or "gpt-4o"
        else:
            openai_model = "gpt-4o"
        logging.debug(f"OpenAI: Using model: {openai_model}")

        headers = {
            'Authorization': f'Bearer {openai_api_key}',
            'Content-Type': 'application/json'
        }

        logging.debug(
            f"OpenAI API Key: {openai_api_key[:5]}...{openai_api_key[-5:] if openai_api_key else None}")
        logging.debug("openai: Preparing data + prompt for submittal")
        openai_prompt = f"{text} \n\n\n\n{custom_prompt_arg}"
        if temp is None:
            temp = 0.7
        if system_message is None:
            system_message = "You are a helpful AI assistant who does whatever the user requests."
        temp = float(temp)
        # Renamed from `data` to avoid shadowing the parsed input above.
        payload = {
            "model": openai_model,
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": openai_prompt}
            ],
            "max_tokens": 4096,
            "temperature": temp
        }

        logging.debug("OpenAI: Posting request")
        response = requests.post('https://api.openai.com/v1/chat/completions', headers=headers, json=payload)

        if response.status_code == 200:
            response_data = response.json()
            if 'choices' in response_data and len(response_data['choices']) > 0:
                summary = response_data['choices'][0]['message']['content'].strip()
                logging.debug("OpenAI: Summarization successful")
                logging.debug(f"OpenAI: Summary (first 500 chars): {summary[:500]}...")
                return summary
            else:
                logging.warning("OpenAI: Summary not found in the response data")
                return "OpenAI: Summary not available"
        else:
            logging.error(f"OpenAI: Summarization failed with status code {response.status_code}")
            logging.error(f"OpenAI: Error response: {response.text}")
            return f"OpenAI: Failed to process summary. Status code: {response.status_code}"
    except json.JSONDecodeError as e:
        logging.error(f"OpenAI: Error decoding JSON: {str(e)}", exc_info=True)
        return f"OpenAI: Error decoding JSON input: {str(e)}"
    except requests.RequestException as e:
        logging.error(f"OpenAI: Error making API request: {str(e)}", exc_info=True)
        return f"OpenAI: Error making API request: {str(e)}"
    except Exception as e:
        logging.error(f"OpenAI: Unexpected error: {str(e)}", exc_info=True)
        return f"OpenAI: Unexpected error occurred: {str(e)}"
230
-
231
-
232
def summarize_with_anthropic(api_key, input_data, custom_prompt_arg, temp=None, system_message=None, max_retries=3, retry_delay=5):
    """Summarize text with the Anthropic Messages API, retrying on HTTP 500.

    Parameters:
        api_key: Anthropic API key; falls back to the config file when empty.
        input_data: Path to a JSON file, a dict with an existing 'summary',
            a list of segment dicts, or raw text.
        custom_prompt_arg: Prompt text appended after the extracted content.
        temp: Sampling temperature; defaults to 0.1 when None.
        system_message: System prompt; defaults to a generic assistant message.
        max_retries: Attempts allowed when the API returns a 500 error.
        retry_delay: Seconds slept between retries.

    Returns:
        The summary string, an existing 'summary' value, None for an
        unexpected/non-retryable API response, or an error string otherwise.
    """
    logging.debug("Anthropic: Summarization process starting...")
    try:
        logging.debug("Anthropic: Loading and validating configurations")
        loaded_config_data = load_and_log_configs()
        if loaded_config_data is None:
            logging.error("Failed to load configuration data")
            anthropic_api_key = None
        else:
            # Prioritize the API key passed as a parameter
            if api_key and api_key.strip():
                anthropic_api_key = api_key
                logging.info("Anthropic: Using API key provided as parameter")
            else:
                # If no parameter is provided, use the key from the config
                anthropic_api_key = loaded_config_data['api_keys'].get('anthropic')
                if anthropic_api_key:
                    logging.info("Anthropic: Using API key from config file")
                else:
                    logging.warning("Anthropic: No API key found in config file")

        # Final check to ensure we have a valid API key.  The original fell
        # through here (its own FIXME) and crashed slicing None below; return
        # an explicit error string instead.
        if not anthropic_api_key or not anthropic_api_key.strip():
            logging.error("Anthropic: No valid API key available")
            return "Anthropic: No valid API key available"

        logging.debug(f"Anthropic: Using API Key: {anthropic_api_key[:5]}...{anthropic_api_key[-5:]}")

        # Input may be a path to a JSON file or an in-memory payload.
        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("AnthropicAI: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("AnthropicAI: Using provided string data for summarization")
            data = input_data

        # DEBUG - Debug logging to identify sent data
        logging.debug(f"AnthropicAI: Loaded data: {data[:500]}...(snipped to first 500 chars)")
        logging.debug(f"AnthropicAI: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # If the loaded data is a dictionary and already contains a summary, return it
            logging.debug("Anthropic: Summary already exists in the loaded data")
            return data['summary']

        # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
        if isinstance(data, list):
            segments = data
            text = extract_text_from_segments(segments)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("Anthropic: Invalid input data format")

        if temp is None:
            temp = 0.1
        temp = float(temp)

        if system_message is None:
            system_message = "You are a helpful AI assistant who does whatever the user requests."

        headers = {
            'x-api-key': anthropic_api_key,
            'anthropic-version': '2023-06-01',
            'Content-Type': 'application/json'
        }

        anthropic_prompt = custom_prompt_arg
        logging.debug(f"Anthropic: Prompt is {anthropic_prompt}")
        user_message = {
            "role": "user",
            "content": f"{text} \n\n\n\n{anthropic_prompt}"
        }

        model = loaded_config_data['models']['anthropic']

        data = {
            "model": model,
            "max_tokens": 4096,  # max _possible_ tokens to return
            "messages": [user_message],
            "stop_sequences": ["\n\nHuman:"],
            "temperature": temp,
            "top_k": 0,
            "top_p": 1.0,
            "metadata": {
                "user_id": "example_user_id",
            },
            "stream": False,
            "system": system_message
        }

        for attempt in range(max_retries):
            try:
                logging.debug("anthropic: Posting request to API")
                response = requests.post('https://api.anthropic.com/v1/messages', headers=headers, json=data)

                # Check if the status code indicates success
                if response.status_code == 200:
                    logging.debug("anthropic: Post submittal successful")
                    response_data = response.json()
                    try:
                        summary = response_data['content'][0]['text'].strip()
                        logging.debug("anthropic: Summarization successful")
                        print("Summary processed successfully.")
                        return summary
                    except (IndexError, KeyError) as e:
                        logging.debug("anthropic: Unexpected data in response")
                        print("Unexpected response format from Anthropic API:", response.text)
                        return None
                elif response.status_code == 500:  # Handle internal server error specifically
                    logging.debug("anthropic: Internal server error")
                    print("Internal server error from API. Retrying may be necessary.")
                    time.sleep(retry_delay)
                else:
                    logging.debug(
                        f"anthropic: Failed to summarize, status code {response.status_code}: {response.text}")
                    print(f"Failed to process summary, status code {response.status_code}: {response.text}")
                    return None

            except RequestException as e:
                logging.error(f"anthropic: Network error during attempt {attempt + 1}/{max_retries}: {str(e)}")
                if attempt < max_retries - 1:
                    time.sleep(retry_delay)
                else:
                    return f"anthropic: Network error: {str(e)}"

        # Fixed: the original fell off the loop and returned None silently
        # after exhausting retries on repeated 500 responses.
        logging.error("anthropic: Summarization failed after maximum retries")
        return "anthropic: Failed to process summary after maximum retries"
    except FileNotFoundError as e:
        logging.error(f"anthropic: File not found: {input_data}")
        return f"anthropic: File not found: {input_data}"
    except json.JSONDecodeError as e:
        logging.error(f"anthropic: Invalid JSON format in file: {input_data}")
        return f"anthropic: Invalid JSON format in file: {input_data}"
    except Exception as e:
        logging.error(f"anthropic: Error in processing: {str(e)}")
        return f"anthropic: Error occurred while processing summary with Anthropic: {str(e)}"
369
-
370
-
371
- # Summarize with Cohere
372
- def summarize_with_cohere(api_key, input_data, custom_prompt_arg, temp=None, system_message=None):
373
- logging.debug("Cohere: Summarization process starting...")
374
- try:
375
- logging.debug("Cohere: Loading and validating configurations")
376
- loaded_config_data = load_and_log_configs()
377
- if loaded_config_data is None:
378
- logging.error("Failed to load configuration data")
379
- cohere_api_key = None
380
- else:
381
- # Prioritize the API key passed as a parameter
382
- if api_key and api_key.strip():
383
- cohere_api_key = api_key
384
- logging.info("Cohere: Using API key provided as parameter")
385
- else:
386
- # If no parameter is provided, use the key from the config
387
- cohere_api_key = loaded_config_data['api_keys'].get('cohere')
388
- if cohere_api_key:
389
- logging.info("Cohere: Using API key from config file")
390
- else:
391
- logging.warning("Cohere: No API key found in config file")
392
-
393
- # Final check to ensure we have a valid API key
394
- if not cohere_api_key or not cohere_api_key.strip():
395
- logging.error("Cohere: No valid API key available")
396
- # You might want to raise an exception here or handle this case as appropriate for your application
397
- # FIXME
398
- # For example: raise ValueError("No valid Anthropic API key available")
399
-
400
- if custom_prompt_arg is None:
401
- custom_prompt_arg = ""
402
-
403
- if system_message is None:
404
- system_message = ""
405
-
406
- logging.debug(f"Cohere: Using API Key: {cohere_api_key[:5]}...{cohere_api_key[-5:]}")
407
-
408
- if isinstance(input_data, str) and os.path.isfile(input_data):
409
- logging.debug("Cohere: Loading json data for summarization")
410
- with open(input_data, 'r') as file:
411
- data = json.load(file)
412
- else:
413
- logging.debug("Cohere: Using provided string data for summarization")
414
- data = input_data
415
-
416
- # DEBUG - Debug logging to identify sent data
417
- logging.debug(f"Cohere: Loaded data: {data[:500]}...(snipped to first 500 chars)")
418
- logging.debug(f"Cohere: Type of data: {type(data)}")
419
-
420
- if isinstance(data, dict) and 'summary' in data:
421
- # If the loaded data is a dictionary and already contains a summary, return it
422
- logging.debug("Cohere: Summary already exists in the loaded data")
423
- return data['summary']
424
-
425
- # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
426
- if isinstance(data, list):
427
- segments = data
428
- text = extract_text_from_segments(segments)
429
- elif isinstance(data, str):
430
- text = data
431
- else:
432
- raise ValueError("Invalid input data format")
433
-
434
- cohere_model = loaded_config_data['models']['cohere']
435
-
436
- if temp is None:
437
- temp = 0.3
438
- temp = float(temp)
439
- if system_message is None:
440
- system_message = "You are a helpful AI assistant who does whatever the user requests."
441
-
442
- headers = {
443
- 'accept': 'application/json',
444
- 'content-type': 'application/json',
445
- 'Authorization': f'Bearer {cohere_api_key}'
446
- }
447
-
448
- cohere_prompt = f"{text} \n\n\n\n{custom_prompt_arg}"
449
- logging.debug(f"cohere: Prompt being sent is {cohere_prompt}")
450
-
451
- data = {
452
- "preamble": system_message,
453
- "message": cohere_prompt,
454
- "model": cohere_model,
455
- # "connectors": [{"id": "web-search"}],
456
- "temperature": temp
457
- }
458
-
459
- logging.debug("cohere: Submitting request to API endpoint")
460
- response = requests.post('https://api.cohere.ai/v1/chat', headers=headers, json=data)
461
- response_data = response.json()
462
- logging.debug("API Response Data: %s", response_data)
463
-
464
- if response.status_code == 200:
465
- if 'text' in response_data:
466
- summary = response_data['text'].strip()
467
- logging.debug("cohere: Summarization successful")
468
- print("Summary processed successfully.")
469
- return summary
470
- else:
471
- logging.error("Expected data not found in API response.")
472
- return "Expected data not found in API response."
473
- else:
474
- logging.error(f"cohere: API request failed with status code {response.status_code}: {response.text}")
475
- print(f"Failed to process summary, status code {response.status_code}: {response.text}")
476
- return f"cohere: API request failed: {response.text}"
477
-
478
- except Exception as e:
479
- logging.error("cohere: Error in processing: %s", str(e))
480
- return f"cohere: Error occurred while processing summary with Cohere: {str(e)}"
481
-
482
-
483
- # https://console.groq.com/docs/quickstart
484
- def summarize_with_groq(api_key, input_data, custom_prompt_arg, temp=None, system_message=None):
485
- logging.debug("Groq: Summarization process starting...")
486
- try:
487
- logging.debug("Groq: Loading and validating configurations")
488
- loaded_config_data = load_and_log_configs()
489
- if loaded_config_data is None:
490
- logging.error("Failed to load configuration data")
491
- groq_api_key = None
492
- else:
493
- # Prioritize the API key passed as a parameter
494
- if api_key and api_key.strip():
495
- groq_api_key = api_key
496
- logging.info("Groq: Using API key provided as parameter")
497
- else:
498
- # If no parameter is provided, use the key from the config
499
- groq_api_key = loaded_config_data['api_keys'].get('groq')
500
- if groq_api_key:
501
- logging.info("Groq: Using API key from config file")
502
- else:
503
- logging.warning("Groq: No API key found in config file")
504
-
505
- # Final check to ensure we have a valid API key
506
- if not groq_api_key or not groq_api_key.strip():
507
- logging.error("Anthropic: No valid API key available")
508
- # You might want to raise an exception here or handle this case as appropriate for your application
509
- # FIXME
510
- # For example: raise ValueError("No valid Anthropic API key available")
511
-
512
- logging.debug(f"Groq: Using API Key: {groq_api_key[:5]}...{groq_api_key[-5:]}")
513
-
514
- # Transcript data handling & Validation
515
- if isinstance(input_data, str) and os.path.isfile(input_data):
516
- logging.debug("Groq: Loading json data for summarization")
517
- with open(input_data, 'r') as file:
518
- data = json.load(file)
519
- else:
520
- logging.debug("Groq: Using provided string data for summarization")
521
- data = input_data
522
-
523
- # DEBUG - Debug logging to identify sent data
524
- logging.debug(f"Groq: Loaded data: {data[:500]}...(snipped to first 500 chars)")
525
- logging.debug(f"Groq: Type of data: {type(data)}")
526
-
527
- if isinstance(data, dict) and 'summary' in data:
528
- # If the loaded data is a dictionary and already contains a summary, return it
529
- logging.debug("Groq: Summary already exists in the loaded data")
530
- return data['summary']
531
-
532
- # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
533
- if isinstance(data, list):
534
- segments = data
535
- text = extract_text_from_segments(segments)
536
- elif isinstance(data, str):
537
- text = data
538
- else:
539
- raise ValueError("Groq: Invalid input data format")
540
-
541
- # Set the model to be used
542
- groq_model = loaded_config_data['models']['groq']
543
-
544
- if temp is None:
545
- temp = 0.2
546
- temp = float(temp)
547
- if system_message is None:
548
- system_message = "You are a helpful AI assistant who does whatever the user requests."
549
-
550
- headers = {
551
- 'Authorization': f'Bearer {groq_api_key}',
552
- 'Content-Type': 'application/json'
553
- }
554
-
555
- groq_prompt = f"{text} \n\n\n\n{custom_prompt_arg}"
556
- logging.debug("groq: Prompt being sent is {groq_prompt}")
557
-
558
- data = {
559
- "messages": [
560
- {
561
- "role": "system",
562
- "content": system_message,
563
- },
564
- {
565
- "role": "user",
566
- "content": groq_prompt,
567
- }
568
- ],
569
- "model": groq_model,
570
- "temperature": temp
571
- }
572
-
573
- logging.debug("groq: Submitting request to API endpoint")
574
- print("groq: Submitting request to API endpoint")
575
- response = requests.post('https://api.groq.com/openai/v1/chat/completions', headers=headers, json=data)
576
-
577
- response_data = response.json()
578
- logging.debug("API Response Data: %s", response_data)
579
-
580
- if response.status_code == 200:
581
- if 'choices' in response_data and len(response_data['choices']) > 0:
582
- summary = response_data['choices'][0]['message']['content'].strip()
583
- logging.debug("groq: Summarization successful")
584
- print("Summarization successful.")
585
- return summary
586
- else:
587
- logging.error("Expected data not found in API response.")
588
- return "Expected data not found in API response."
589
- else:
590
- logging.error(f"groq: API request failed with status code {response.status_code}: {response.text}")
591
- return f"groq: API request failed: {response.text}"
592
-
593
- except Exception as e:
594
- logging.error("groq: Error in processing: %s", str(e))
595
- return f"groq: Error occurred while processing summary with groq: {str(e)}"
596
-
597
-
598
def summarize_with_openrouter(api_key, input_data, custom_prompt_arg, temp=None, system_message=None):
    """Summarize text with the OpenRouter chat-completions API.

    Parameters:
        api_key: OpenRouter API key; falls back to the config file when empty.
        input_data: Path to a JSON file, a dict with an existing 'summary',
            a list of segment dicts, or raw text.
        custom_prompt_arg: Prompt text appended after the extracted content.
        temp: Sampling temperature; defaults to 0.1 when None.
        system_message: System prompt; defaults to a generic assistant message.

    Returns:
        The summary string, an existing 'summary' value, or an error string.
    """
    try:
        logging.debug("OpenRouter: Loading and validating configurations")
        loaded_config_data = load_and_log_configs()
        if loaded_config_data is None:
            logging.error("Failed to load configuration data")
            openrouter_api_key = None
            openrouter_model = None
        else:
            # Prioritize the API key passed as a parameter
            if api_key and api_key.strip():
                openrouter_api_key = api_key
                logging.info("OpenRouter: Using API key provided as parameter")
            else:
                # If no parameter is provided, use the key from the config
                openrouter_api_key = loaded_config_data['api_keys'].get('openrouter')
                if openrouter_api_key:
                    logging.info("OpenRouter: Using API key from config file")
                else:
                    logging.warning("OpenRouter: No API key found in config file")

            # Model Selection validation (the original reloaded the config
            # file here a second time; the copy loaded above is reused).
            logging.debug("OpenRouter: Validating model selection")
            openrouter_model = loaded_config_data['models']['openrouter']
            logging.debug(f"OpenRouter: Using model from config file: {openrouter_model}")

        # Final check to ensure we have a valid API key
        if not openrouter_api_key or not openrouter_api_key.strip():
            logging.error("OpenRouter: No valid API key available")
            # Fixed: the original message named "Anthropic" here.
            raise ValueError("No valid OpenRouter API key available")
    except Exception as e:
        logging.error("OpenRouter: Error in processing: %s", str(e))
        return f"OpenRouter: Error occurred while processing config file with OpenRouter: {str(e)}"

    logging.debug(f"OpenRouter: Using API Key: {openrouter_api_key[:5]}...{openrouter_api_key[-5:]}")

    logging.debug(f"OpenRouter: Using Model: {openrouter_model}")

    if isinstance(input_data, str) and os.path.isfile(input_data):
        logging.debug("OpenRouter: Loading json data for summarization")
        with open(input_data, 'r') as file:
            data = json.load(file)
    else:
        logging.debug("OpenRouter: Using provided string data for summarization")
        data = input_data

    # DEBUG - Debug logging to identify sent data
    logging.debug(f"OpenRouter: Loaded data: {data[:500]}...(snipped to first 500 chars)")
    logging.debug(f"OpenRouter: Type of data: {type(data)}")

    if isinstance(data, dict) and 'summary' in data:
        # If the loaded data is a dictionary and already contains a summary, return it
        logging.debug("OpenRouter: Summary already exists in the loaded data")
        return data['summary']

    # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
    if isinstance(data, list):
        segments = data
        text = extract_text_from_segments(segments)
    elif isinstance(data, str):
        text = data
    else:
        raise ValueError("OpenRouter: Invalid input data format")

    # Fixed: the original interpolated the raw input_data here, so the
    # extracted `text` was dead code and file-path inputs summarized the
    # path string itself.
    openrouter_prompt = f"{text} \n\n\n\n{custom_prompt_arg}"

    if temp is None:
        temp = 0.1
    temp = float(temp)
    if system_message is None:
        system_message = "You are a helpful AI assistant who does whatever the user requests."

    try:
        logging.debug("OpenRouter: Submitting request to API endpoint")
        print("OpenRouter: Submitting request to API endpoint")
        response = requests.post(
            url="https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {openrouter_api_key}",
            },
            data=json.dumps({
                "model": openrouter_model,
                "messages": [
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": openrouter_prompt}
                ],
                "temperature": temp
            })
        )

        response_data = response.json()
        logging.debug("API Response Data: %s", response_data)

        if response.status_code == 200:
            if 'choices' in response_data and len(response_data['choices']) > 0:
                summary = response_data['choices'][0]['message']['content'].strip()
                logging.debug("openrouter: Summarization successful")
                print("openrouter: Summarization successful.")
                return summary
            else:
                logging.error("openrouter: Expected data not found in API response.")
                return "openrouter: Expected data not found in API response."
        else:
            logging.error(f"openrouter: API request failed with status code {response.status_code}: {response.text}")
            return f"openrouter: API request failed: {response.text}"
    except Exception as e:
        logging.error("openrouter: Error in processing: %s", str(e))
        return f"openrouter: Error occurred while processing summary with openrouter: {str(e)}"
709
-
710
-
711
def summarize_with_huggingface(api_key, input_data, custom_prompt_arg, temp=None):
    """Summarize text via the HuggingFace Inference API.

    Args:
        api_key: Optional HuggingFace API key; falls back to the config file.
        input_data: Path to a JSON file, a list of segment dicts, or a plain string.
        custom_prompt_arg: Instructions appended to the transcript text.
        temp: Sampling temperature (defaults to 0.1).

    Returns:
        The generated summary string, an existing summary found in the input,
        an error-description string on HTTP failure, or None on error.
    """
    logging.debug("HuggingFace: Summarization process starting...")
    try:
        logging.debug("HuggingFace: Loading and validating configurations")
        loaded_config_data = load_and_log_configs()
        huggingface_api_key = None
        if loaded_config_data is None:
            logging.error("Failed to load configuration data")
        else:
            # Prioritize the API key passed as a parameter over the config file.
            if api_key and api_key.strip():
                huggingface_api_key = api_key
                logging.info("HuggingFace: Using API key provided as parameter")
            else:
                huggingface_api_key = loaded_config_data['api_keys'].get('huggingface')
                if huggingface_api_key:
                    logging.info("HuggingFace: Using API key from config file")
                else:
                    logging.warning("HuggingFace: No API key found in config file")

        # Final check to ensure we have a valid API key. Previously execution
        # continued and crashed slicing None in the debug log below.
        if not huggingface_api_key or not huggingface_api_key.strip():
            logging.error("HuggingFace: No valid API key available")
            return None

        logging.debug(f"HuggingFace: Using API Key: {huggingface_api_key[:5]}...{huggingface_api_key[-5:]}")

        # Input handling: a file path is loaded as JSON; anything else is used as-is.
        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("HuggingFace: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("HuggingFace: Using provided string data for summarization")
            data = input_data

        # DEBUG - Debug logging to identify sent data
        logging.debug(f"HuggingFace: Loaded data: {str(data)[:500]}...(snipped to first 500 chars)")
        logging.debug(f"HuggingFace: Type of data: {type(data)}")

        # If the loaded data already contains a summary, return it unchanged.
        if isinstance(data, dict) and 'summary' in data:
            logging.debug("HuggingFace: Summary already exists in the loaded data")
            return data['summary']

        # Normalize the input to a plain text string.
        if isinstance(data, list):
            segments = data
            text = extract_text_from_segments(segments)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("HuggingFace: Invalid input data format")

        headers = {
            "Authorization": f"Bearer {huggingface_api_key}"
        }
        huggingface_model = loaded_config_data['models']['huggingface']
        API_URL = f"https://api-inference.huggingface.co/models/{huggingface_model}"
        if temp is None:
            temp = 0.1
        temp = float(temp)
        huggingface_prompt = f"{text}\n\n\n\n{custom_prompt_arg}"
        # BUG FIX: this was a non-f-string before, so the prompt was never interpolated.
        logging.debug(f"huggingface: Prompt being sent is {huggingface_prompt}")
        data = {
            # BUG FIX: previously only the bare `text` was sent, so
            # custom_prompt_arg was built but silently ignored.
            "inputs": huggingface_prompt,
            "parameters": {"max_length": 512, "min_length": 100}  # Adjust as needed
        }

        logging.debug("huggingface: Submitting request...")
        response = requests.post(API_URL, headers=headers, json=data)

        if response.status_code == 200:
            summary = response.json()[0]['generated_text'].strip()
            logging.debug("huggingface: Summarization successful")
            print("Summarization successful.")
            return summary
        else:
            logging.error(f"huggingface: Summarization failed with status code {response.status_code}: {response.text}")
            return f"Failed to process summary, status code {response.status_code}: {response.text}"

    except Exception as e:
        logging.error("huggingface: Error in processing: %s", str(e))
        print(f"Error occurred while processing summary with huggingface: {str(e)}")
        return None
801
-
802
-
803
def summarize_with_deepseek(api_key, input_data, custom_prompt_arg, temp=None, system_message=None):
    """Summarize text via the DeepSeek chat-completions API.

    Args:
        api_key: Optional DeepSeek API key; falls back to the config file.
        input_data: Path to a JSON file, a list of segment dicts, or a plain string.
        custom_prompt_arg: Instructions appended to the transcript text.
        temp: Sampling temperature (defaults to 0.1).
        system_message: System prompt (defaults to a generic assistant prompt).

    Returns:
        The summary string, an existing summary found in the input, or an
        error-description string on failure.
    """
    logging.debug("DeepSeek: Summarization process starting...")
    try:
        logging.debug("DeepSeek: Loading and validating configurations")
        loaded_config_data = load_and_log_configs()
        deepseek_api_key = None
        if loaded_config_data is None:
            logging.error("Failed to load configuration data")
        else:
            # Prioritize the API key passed as a parameter over the config file.
            if api_key and api_key.strip():
                deepseek_api_key = api_key
                logging.info("DeepSeek: Using API key provided as parameter")
            else:
                deepseek_api_key = loaded_config_data['api_keys'].get('deepseek')
                if deepseek_api_key:
                    logging.info("DeepSeek: Using API key from config file")
                else:
                    logging.warning("DeepSeek: No API key found in config file")

        # Final check to ensure we have a valid API key. Previously execution
        # continued and crashed slicing None in the debug log below.
        if not deepseek_api_key or not deepseek_api_key.strip():
            logging.error("DeepSeek: No valid API key available")
            return "DeepSeek: No valid API key available"

        logging.debug(f"DeepSeek: Using API Key: {deepseek_api_key[:5]}...{deepseek_api_key[-5:]}")

        # Input data handling: a file path is loaded as JSON; otherwise use as-is.
        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("DeepSeek: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("DeepSeek: Using provided string data for summarization")
            data = input_data

        # DEBUG - Debug logging to identify sent data
        logging.debug(f"DeepSeek: Loaded data: {str(data)[:500]}...(snipped to first 500 chars)")
        logging.debug(f"DeepSeek: Type of data: {type(data)}")

        # If the loaded data already contains a summary, return it unchanged.
        if isinstance(data, dict) and 'summary' in data:
            logging.debug("DeepSeek: Summary already exists in the loaded data")
            return data['summary']

        # Normalize the input to a plain text string.
        if isinstance(data, list):
            segments = data
            text = extract_text_from_segments(segments)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("DeepSeek: Invalid input data format")

        deepseek_model = loaded_config_data['models']['deepseek'] or "deepseek-chat"

        if temp is None:
            temp = 0.1
        temp = float(temp)
        if system_message is None:
            system_message = "You are a helpful AI assistant who does whatever the user requests."

        headers = {
            # BUG FIX: this previously used the raw `api_key` parameter, which
            # is None/empty when the key comes from the config file, producing
            # an invalid "Bearer None" Authorization header.
            'Authorization': f'Bearer {deepseek_api_key}',
            'Content-Type': 'application/json'
        }

        logging.debug(
            f"DeepSeek API Key: {deepseek_api_key[:5]}...{deepseek_api_key[-5:] if deepseek_api_key else None}")
        logging.debug("DeepSeek: Preparing data + prompt for submittal")
        deepseek_prompt = f"{text} \n\n\n\n{custom_prompt_arg}"
        data = {
            "model": deepseek_model,
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": deepseek_prompt}
            ],
            "stream": False,
            "temperature": temp
        }

        logging.debug("DeepSeek: Posting request")
        response = requests.post('https://api.deepseek.com/chat/completions', headers=headers, json=data)

        if response.status_code == 200:
            response_data = response.json()
            if 'choices' in response_data and len(response_data['choices']) > 0:
                summary = response_data['choices'][0]['message']['content'].strip()
                logging.debug("DeepSeek: Summarization successful")
                return summary
            else:
                logging.warning("DeepSeek: Summary not found in the response data")
                return "DeepSeek: Summary not available"
        else:
            logging.error(f"DeepSeek: Summarization failed with status code {response.status_code}")
            logging.error(f"DeepSeek: Error response: {response.text}")
            return f"DeepSeek: Failed to process summary. Status code: {response.status_code}"
    except Exception as e:
        logging.error(f"DeepSeek: Error in processing: {str(e)}", exc_info=True)
        return f"DeepSeek: Error occurred while processing summary: {str(e)}"
907
-
908
-
909
def summarize_with_mistral(api_key, input_data, custom_prompt_arg, temp=None, system_message=None):
    """Summarize text via the Mistral chat-completions API.

    Args:
        api_key: Optional Mistral API key; falls back to the config file.
        input_data: Path to a JSON file, a list of segment dicts, or a plain string.
        custom_prompt_arg: Instructions prepended to the transcript text.
        temp: Sampling temperature (defaults to 0.2).
        system_message: System prompt (defaults to a generic assistant prompt).

    Returns:
        The summary string, an existing summary found in the input, or an
        error-description string on failure.
    """
    logging.debug("Mistral: Summarization process starting...")
    try:
        logging.debug("Mistral: Loading and validating configurations")
        loaded_config_data = load_and_log_configs()
        mistral_api_key = None
        if loaded_config_data is None:
            logging.error("Failed to load configuration data")
        else:
            # Prioritize the API key passed as a parameter over the config file.
            if api_key and api_key.strip():
                mistral_api_key = api_key
                logging.info("Mistral: Using API key provided as parameter")
            else:
                mistral_api_key = loaded_config_data['api_keys'].get('mistral')
                if mistral_api_key:
                    logging.info("Mistral: Using API key from config file")
                else:
                    logging.warning("Mistral: No API key found in config file")

        # Final check to ensure we have a valid API key. Previously execution
        # continued and crashed slicing None in the debug log below.
        if not mistral_api_key or not mistral_api_key.strip():
            logging.error("Mistral: No valid API key available")
            return "Mistral: No valid API key available"

        logging.debug(f"Mistral: Using API Key: {mistral_api_key[:5]}...{mistral_api_key[-5:]}")

        # Input data handling: a file path is loaded as JSON; otherwise use as-is.
        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("Mistral: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("Mistral: Using provided string data for summarization")
            data = input_data

        # DEBUG - Debug logging to identify sent data
        logging.debug(f"Mistral: Loaded data: {str(data)[:500]}...(snipped to first 500 chars)")
        logging.debug(f"Mistral: Type of data: {type(data)}")

        # If the loaded data already contains a summary, return it unchanged.
        if isinstance(data, dict) and 'summary' in data:
            logging.debug("Mistral: Summary already exists in the loaded data")
            return data['summary']

        # Normalize the input to a plain text string.
        if isinstance(data, list):
            segments = data
            text = extract_text_from_segments(segments)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("Mistral: Invalid input data format")

        mistral_model = loaded_config_data['models']['mistral'] or "mistral-large-latest"

        if temp is None:
            temp = 0.2
        temp = float(temp)
        if system_message is None:
            system_message = "You are a helpful AI assistant who does whatever the user requests."

        headers = {
            'Authorization': f'Bearer {mistral_api_key}',
            'Content-Type': 'application/json'
        }

        # BUG FIX: this log line previously said "Deepseek API Key" (copy-paste).
        logging.debug(
            f"Mistral API Key: {mistral_api_key[:5]}...{mistral_api_key[-5:] if mistral_api_key else None}")
        logging.debug("Mistral: Preparing data + prompt for submittal")
        mistral_prompt = f"{custom_prompt_arg}\n\n\n\n{text} "
        data = {
            "model": mistral_model,
            "messages": [
                {"role": "system",
                 "content": system_message},
                {"role": "user",
                 "content": mistral_prompt}
            ],
            "temperature": temp,
            "top_p": 1,
            "max_tokens": 4096,
            # BUG FIX: these were the JSON strings "false"; the Mistral API
            # expects boolean values for `stream` and `safe_prompt`.
            "stream": False,
            "safe_prompt": False
        }

        logging.debug("Mistral: Posting request")
        response = requests.post('https://api.mistral.ai/v1/chat/completions', headers=headers, json=data)

        if response.status_code == 200:
            response_data = response.json()
            if 'choices' in response_data and len(response_data['choices']) > 0:
                summary = response_data['choices'][0]['message']['content'].strip()
                logging.debug("Mistral: Summarization successful")
                return summary
            else:
                logging.warning("Mistral: Summary not found in the response data")
                return "Mistral: Summary not available"
        else:
            logging.error(f"Mistral: Summarization failed with status code {response.status_code}")
            logging.error(f"Mistral: Error response: {response.text}")
            return f"Mistral: Failed to process summary. Status code: {response.status_code}"
    except Exception as e:
        logging.error(f"Mistral: Error in processing: {str(e)}", exc_info=True)
        return f"Mistral: Error occurred while processing summary: {str(e)}"
1018
-
1019
- #
1020
- #
1021
- #######################################################################################################################
1022
- #
1023
- #
1024
- # Gradio File Processing
1025
-
1026
-
1027
- # Handle multiple videos as input
1028
def process_video_urls(url_list, num_speakers, whisper_model, custom_prompt_input, offset, api_name, api_key, vad_filter,
                       download_video_flag, download_audio, rolling_summarization, detail_level, question_box,
                       keywords, chunk_text_by_words, max_words, chunk_text_by_sentences, max_sentences,
                       chunk_text_by_paragraphs, max_paragraphs, chunk_text_by_tokens, max_tokens, chunk_by_semantic,
                       semantic_chunk_size, semantic_chunk_overlap, recursive_summarization):
    """Process a list of video URLs sequentially, yielding UI progress updates.

    Generator: after each URL it yields
    (progress_text, status_text, None, None, None, None), and a final tuple
    carrying the overall success message once all URLs are done.
    """
    global current_progress
    progress = []  # accumulated per-URL progress lines (must stay a list)
    status = []    # accumulated per-URL status lines (must stay a list)

    if custom_prompt_input is None:
        custom_prompt_input = """
    You are a bulleted notes specialist. ```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.
**Bulleted Note Creation Guidelines**

**Headings**:
- Based on referenced topics, not categories like quotes or terms
- Surrounded by **bold** formatting
- Not listed as bullet points
- No space between headings and list items underneath

**Emphasis**:
- **Important terms** set in bold font
- **Text ending in a colon**: also bolded

**Review**:
- Ensure adherence to specified format
- Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]"""

    def update_progress(index, url, message):
        # Append to the running logs and return joined strings for display.
        progress.append(f"Processing {index + 1}/{len(url_list)}: {url}")
        status.append(message)
        return "\n".join(progress), "\n".join(status)

    for index, url in enumerate(url_list):
        try:
            logging.info(f"Starting to process video {index + 1}/{len(url_list)}: {url}")
            transcription, summary, json_file_path, summary_file_path, _, _ = process_url(
                url=url,
                num_speakers=num_speakers,
                whisper_model=whisper_model,
                custom_prompt_input=custom_prompt_input,
                offset=offset,
                api_name=api_name,
                api_key=api_key,
                vad_filter=vad_filter,
                download_video_flag=download_video_flag,
                download_audio=download_audio,
                rolling_summarization=rolling_summarization,
                detail_level=detail_level,
                question_box=question_box,
                keywords=keywords,
                chunk_text_by_words=chunk_text_by_words,
                max_words=max_words,
                chunk_text_by_sentences=chunk_text_by_sentences,
                max_sentences=max_sentences,
                chunk_text_by_paragraphs=chunk_text_by_paragraphs,
                max_paragraphs=max_paragraphs,
                chunk_text_by_tokens=chunk_text_by_tokens,
                max_tokens=max_tokens,
                chunk_by_semantic=chunk_by_semantic,
                semantic_chunk_size=semantic_chunk_size,
                semantic_chunk_overlap=semantic_chunk_overlap,
                recursive_summarization=recursive_summarization)

            current_progress, current_status = update_progress(index, url, "Video processed and ingested into the database.")
            logging.info(f"Successfully processed video {index + 1}/{len(url_list)}: {url}")

            time.sleep(1)
        except Exception as e:
            logging.error(f"Error processing video {index + 1}/{len(url_list)}: {url}")
            logging.error(f"Error details: {str(e)}")
            current_progress, current_status = update_progress(index, url, f"Error: {str(e)}")

        yield current_progress, current_status, None, None, None, None

    success_message = "All videos have been transcribed, summarized, and ingested into the database successfully."
    # BUG FIX: this was `return <tuple>`. Inside a generator, `return` does not
    # deliver a value to consumers iterating it, so the success message was
    # never shown; yield the final update instead.
    yield current_progress, success_message, None, None, None, None
1106
-
1107
-
1108
-
1109
def perform_transcription(video_path, offset, whisper_model, vad_filter, diarize=False):
    """Convert *video_path* to WAV and transcribe it, reusing cached JSON results.

    Args:
        video_path: Path to the source video/audio file.
        offset: Start offset passed to convert_to_wav.
        whisper_model: Whisper model name used for transcription.
        vad_filter: Whether to apply voice-activity-detection filtering.
        diarize: When True, return speaker-diarized segments instead.

    Returns:
        (audio_file_path, segments), or (None, None) if transcription fails.
    """
    # Published as a module-level global for backward compatibility with other
    # code in this module that reads it.
    global segments_json_path
    audio_file_path = convert_to_wav(video_path, offset)
    segments_json_path = audio_file_path.replace('.wav', '.segments.json')

    if diarize:
        diarized_json_path = audio_file_path.replace('.wav', '.diarized.json')

        # Reuse an existing diarized JSON if present and valid.
        if os.path.exists(diarized_json_path):
            logging.info(f"Diarized file already exists: {diarized_json_path}")
            try:
                with open(diarized_json_path, 'r') as file:
                    diarized_segments = json.load(file)
                    if not diarized_segments:
                        logging.warning(f"Diarized JSON file is empty, re-generating: {diarized_json_path}")
                        raise ValueError("Empty diarized JSON file")
                    logging.debug(f"Loaded diarized segments from {diarized_json_path}")
                    return audio_file_path, diarized_segments
            except (json.JSONDecodeError, ValueError) as e:
                # Corrupt/empty cache: remove it and fall through to regenerate.
                logging.error(f"Failed to read or parse the diarized JSON file: {e}")
                os.remove(diarized_json_path)

        # Generate a new diarized transcription and cache it.
        logging.info(f"Generating diarized transcription for {audio_file_path}")
        diarized_segments = combine_transcription_and_diarization(audio_file_path)

        with open(diarized_json_path, 'w') as file:
            json.dump(diarized_segments, file, indent=2)

        return audio_file_path, diarized_segments

    # Non-diarized transcription: reuse cached segments when present and valid.
    if os.path.exists(segments_json_path):
        logging.info(f"Segments file already exists: {segments_json_path}")
        try:
            with open(segments_json_path, 'r') as file:
                segments = json.load(file)
                if not segments:
                    logging.warning(f"Segments JSON file is empty, re-generating: {segments_json_path}")
                    raise ValueError("Empty segments JSON file")
                logging.debug(f"Loaded segments from {segments_json_path}")
        except (json.JSONDecodeError, ValueError) as e:
            # Corrupt/empty cache: remove it and regenerate the transcription.
            logging.error(f"Failed to read or parse the segments JSON file: {e}")
            os.remove(segments_json_path)
            logging.info(f"Re-generating transcription for {audio_file_path}")
            audio_file, segments = re_generate_transcription(audio_file_path, whisper_model, vad_filter)
            if segments is None:
                return None, None
    else:
        audio_file, segments = re_generate_transcription(audio_file_path, whisper_model, vad_filter)
        # BUG FIX: this branch previously skipped the failure check done above,
        # so a failed transcription returned (audio_file_path, None).
        if segments is None:
            return None, None

    return audio_file_path, segments
1163
-
1164
-
1165
def re_generate_transcription(audio_file_path, whisper_model, vad_filter):
    """Run speech-to-text on *audio_file_path* and cache the segments to JSON.

    Returns:
        (audio_file_path, segments) on success, (None, None) on failure.
    """
    try:
        segments = speech_to_text(audio_file_path, whisper_model=whisper_model, vad_filter=vad_filter)
        # BUG FIX: this previously relied on the `segments_json_path` global set
        # by perform_transcription(), raising NameError when called standalone.
        # The derivation below is identical to what perform_transcription does.
        json_path = audio_file_path.replace('.wav', '.segments.json')
        with open(json_path, 'w') as file:
            json.dump(segments, file, indent=2)
        logging.debug(f"Transcription segments saved to {json_path}")
        return audio_file_path, segments
    except Exception as e:
        logging.error(f"Error in re-generating transcription: {str(e)}")
        return None, None
1176
-
1177
-
1178
def save_transcription_and_summary(transcription_text, summary_text, download_path, info_dict):
    """Write the transcription (and summary, if any) to text files.

    Files are named after the sanitized video title and placed under
    *download_path*.

    Returns:
        (transcription_file_path, summary_file_path). The summary path is None
        when no summary text was provided; (None, None) on any error.
    """
    try:
        safe_title = sanitize_filename(info_dict.get('title', 'Untitled'))

        # Always write the transcription.
        transcription_file_path = os.path.join(download_path, f"{safe_title}_transcription.txt")
        with open(transcription_file_path, 'w', encoding='utf-8') as out:
            out.write(transcription_text)

        # A summary is optional; skip the file entirely when absent.
        if not summary_text:
            return transcription_file_path, None

        summary_file_path = os.path.join(download_path, f"{safe_title}_summary.txt")
        with open(summary_file_path, 'w', encoding='utf-8') as out:
            out.write(summary_text)

        return transcription_file_path, summary_file_path
    except Exception as e:
        logging.error(f"Error in save_transcription_and_summary: {str(e)}", exc_info=True)
        return None, None
1198
-
1199
-
1200
def summarize_chunk(api_name, text, custom_prompt_input, api_key, temp=None, system_message=None):
    """Summarize one chunk of text with the given API.

    Returns:
        The summary string, or None when summarization fails or raises.
    """
    logging.debug("Entered 'summarize_chunk' function")
    try:
        outcome = summarize(text, custom_prompt_input, api_name, api_key, temp, system_message)
        # Treat a None result or an explicit error string as failure.
        if outcome is None or outcome.startswith("Error:"):
            logging.warning(f"Summarization with {api_name} failed: {outcome}")
            return None
        logging.info(f"Summarization with {api_name} successful")
        return outcome
    except Exception as e:
        logging.error(f"Error in summarize_chunk with {api_name}: {str(e)}", exc_info=True)
        return None
1212
-
1213
-
1214
def extract_metadata_and_content(input_data):
    """Split input into a (metadata, content) pair.

    Accepts a dict, a JSON string, a path to a JSON file, or arbitrary data
    (stringified). Non-JSON strings are returned verbatim as content with
    empty metadata.
    """
    # Normalize the input into a parsed object, or bail out early for
    # inputs that cannot be treated as JSON.
    if isinstance(input_data, dict):
        data = input_data
    elif isinstance(input_data, str):
        if os.path.exists(input_data):
            with open(input_data, 'r', encoding='utf-8') as fh:
                data = json.load(fh)
        else:
            try:
                data = json.loads(input_data)
            except json.JSONDecodeError:
                return {}, input_data
    else:
        return {}, str(input_data)

    # Pull out the metadata fields with friendly fallbacks.
    metadata = {
        'title': data.get('title', 'No title available'),
        'author': data.get('author', 'Unknown author'),
    }

    # Pick the content from the first recognized field; otherwise dump
    # the whole object back to JSON text.
    if 'transcription' in data:
        content = extract_text_from_segments(data['transcription'])
    elif 'segments' in data:
        content = extract_text_from_segments(data['segments'])
    elif 'content' in data:
        content = data['content']
    else:
        content = json.dumps(data)

    return metadata, content
1247
-
1248
-
1249
def format_input_with_metadata(metadata, content):
    """Prefix *content* with a Title/Author header built from *metadata*."""
    header = (
        f"Title: {metadata.get('title', 'No title available')}\n"
        f"Author: {metadata.get('author', 'Unknown author')}\n\n"
    )
    return header + content
1254
-
1255
def perform_summarization(api_name, input_data, custom_prompt_input, api_key, recursive_summarization=False, temp=None, system_message=None):
    """Summarize *input_data* with the chosen API, optionally chunk-recursively.

    Args:
        api_name: Name of the summarization backend to use.
        input_data: JSON file path, JSON string, dict, or plain text.
        custom_prompt_input: User prompt passed to the summarizer.
        api_key: API key for the chosen backend.
        recursive_summarization: If True, chunk the input and summarize recursively.
        temp: Optional sampling temperature forwarded to the backend.
        system_message: Optional system prompt; defaults to bulleted-notes instructions.

    Returns:
        The summary string, None when summarization fails (or on connection
        error), or an error string on unexpected exceptions.
    """
    # Called for its configuration-loading/logging side effect; the previous
    # binding of the result was never used.
    load_and_log_configs()
    logging.info("Starting summarization process...")
    if system_message is None:
        system_message = """
    You are a bulleted notes specialist. ```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.
**Bulleted Note Creation Guidelines**

**Headings**:
- Based on referenced topics, not categories like quotes or terms
- Surrounded by **bold** formatting
- Not listed as bullet points
- No space between headings and list items underneath

**Emphasis**:
- **Important terms** set in bold font
- **Text ending in a colon**: also bolded

**Review**:
- Ensure adherence to specified format
- Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]"""

    try:
        logging.debug(f"Input data type: {type(input_data)}")
        logging.debug(f"Input data (first 500 chars): {str(input_data)[:500]}...")

        # Extract metadata and content, then build a structured prompt input.
        metadata, content = extract_metadata_and_content(input_data)

        logging.debug(f"Extracted metadata: {metadata}")
        logging.debug(f"Extracted content (first 500 chars): {content[:500]}...")

        structured_input = format_input_with_metadata(metadata, content)

        if recursive_summarization:
            chunk_options = {
                'method': 'words',  # or 'sentences', 'paragraphs', 'tokens'
                'max_size': 1000,   # adjust as needed
                'overlap': 100,     # adjust as needed
                'adaptive': False,
                'multi_level': False,
                'language': 'english'
            }
            chunks = improved_chunking_process(structured_input, chunk_options)
            logging.debug(f"Chunking process completed. Number of chunks: {len(chunks)}")
            logging.debug("Now performing recursive summarization on each chunk...")
            # NOTE(review): temp/system_message are handed to
            # recursive_summarize_chunks but the per-chunk lambda does not
            # forward them into summarize_chunk — confirm this is intended.
            summary = recursive_summarize_chunks([chunk['text'] for chunk in chunks],
                                                 lambda x: summarize_chunk(api_name, x, custom_prompt_input, api_key),
                                                 custom_prompt_input, temp, system_message)
        else:
            logging.debug("summary = summarize_chunk")
            summary = summarize_chunk(api_name, structured_input, custom_prompt_input, api_key, temp, system_message)

        if summary is not None:
            logging.info(f"Summary generated using {api_name} API")
            if isinstance(input_data, str) and os.path.exists(input_data):
                # BUG FIX: the old `input_data.replace('.json', '_summary.txt')`
                # left non-.json paths unchanged, so the summary overwrote the
                # input file itself. splitext yields the same name for .json
                # inputs and a safe one for everything else.
                base, _ext = os.path.splitext(input_data)
                summary_file_path = base + '_summary.txt'
                with open(summary_file_path, 'w', encoding='utf-8') as file:
                    file.write(summary)
        else:
            logging.warning(f"Failed to generate summary using {api_name} API")

        logging.info("Summarization completed successfully.")

        return summary

    except requests.exceptions.ConnectionError:
        logging.error("Connection error while summarizing")
    except Exception as e:
        logging.error(f"Error summarizing with {api_name}: {str(e)}", exc_info=True)
        return f"An error occurred during summarization: {str(e)}"
    return None
1331
-
1332
def extract_text_from_input(input_data):
    """Flatten a JSON object/string (or plain text) into one readable text blob.

    Recognized fields: title, description, transcription (list/str), segments.
    Plain (non-JSON) strings are returned unchanged; other types stringified.
    """
    # Resolve the input into a parsed object, or return early when it
    # cannot be treated as JSON.
    if isinstance(input_data, dict):
        data = input_data
    elif isinstance(input_data, str):
        try:
            data = json.loads(input_data)
        except json.JSONDecodeError:
            return input_data  # plain text, pass through untouched
    else:
        return str(input_data)

    # Collect the recognized fields, labeled, in a fixed order.
    parts = []
    if 'title' in data:
        parts.append(f"Title: {data['title']}")
    if 'description' in data:
        parts.append(f"Description: {data['description']}")
    if 'transcription' in data:
        raw = data['transcription']
        if isinstance(raw, list):
            # NOTE: segment dicts use a capitalized 'Text' key here.
            body = ' '.join(seg.get('Text', '') for seg in raw)
        elif isinstance(raw, str):
            body = raw
        else:
            body = str(raw)
        parts.append(f"Transcription: {body}")
    elif 'segments' in data:
        parts.append(f"Segments: {extract_text_from_segments(data['segments'])}")

    return '\n\n'.join(parts)
1364
-
1365
-
1366
-
1367
- def process_url(
1368
- url,
1369
- num_speakers,
1370
- whisper_model,
1371
- custom_prompt_input,
1372
- offset,
1373
- api_name,
1374
- api_key,
1375
- vad_filter,
1376
- download_video_flag,
1377
- download_audio,
1378
- rolling_summarization,
1379
- detail_level,
1380
- # It's for the asking a question about a returned prompt - needs to be removed #FIXME
1381
- question_box,
1382
- keywords,
1383
- chunk_text_by_words,
1384
- max_words,
1385
- chunk_text_by_sentences,
1386
- max_sentences,
1387
- chunk_text_by_paragraphs,
1388
- max_paragraphs,
1389
- chunk_text_by_tokens,
1390
- max_tokens,
1391
- chunk_by_semantic,
1392
- semantic_chunk_size,
1393
- semantic_chunk_overlap,
1394
- local_file_path=None,
1395
- diarize=False,
1396
- recursive_summarization=False,
1397
- temp=None,
1398
- system_message=None):
1399
- # Handle the chunk summarization options
1400
- set_chunk_txt_by_words = chunk_text_by_words
1401
- set_max_txt_chunk_words = max_words
1402
- set_chunk_txt_by_sentences = chunk_text_by_sentences
1403
- set_max_txt_chunk_sentences = max_sentences
1404
- set_chunk_txt_by_paragraphs = chunk_text_by_paragraphs
1405
- set_max_txt_chunk_paragraphs = max_paragraphs
1406
- set_chunk_txt_by_tokens = chunk_text_by_tokens
1407
- set_max_txt_chunk_tokens = max_tokens
1408
- set_chunk_txt_by_semantic = chunk_by_semantic
1409
- set_semantic_chunk_size = semantic_chunk_size
1410
- set_semantic_chunk_overlap = semantic_chunk_overlap
1411
-
1412
- progress = []
1413
- success_message = "All videos processed successfully. Transcriptions and summaries have been ingested into the database."
1414
-
1415
- # Validate input
1416
- if not url and not local_file_path:
1417
- return "Process_URL: No URL provided.", "No URL provided.", None, None, None, None, None, None
1418
-
1419
- if isinstance(url, str):
1420
- urls = url.strip().split('\n')
1421
- if len(urls) > 1:
1422
- return process_video_urls(urls, num_speakers, whisper_model, custom_prompt_input, offset, api_name, api_key, vad_filter,
1423
- download_video_flag, download_audio, rolling_summarization, detail_level, question_box,
1424
- keywords, chunk_text_by_words, max_words, chunk_text_by_sentences, max_sentences,
1425
- chunk_text_by_paragraphs, max_paragraphs, chunk_text_by_tokens, max_tokens, chunk_by_semantic, semantic_chunk_size, semantic_chunk_overlap, recursive_summarization)
1426
- else:
1427
- urls = [url]
1428
-
1429
- if url and not is_valid_url(url):
1430
- return "Process_URL: Invalid URL format.", "Invalid URL format.", None, None, None, None, None, None
1431
-
1432
- if url:
1433
- # Clean the URL to remove playlist parameters if any
1434
- url = clean_youtube_url(url)
1435
- logging.info(f"Process_URL: Processing URL: {url}")
1436
-
1437
- if api_name:
1438
- print("Process_URL: API Name received:", api_name) # Debugging line
1439
-
1440
- video_file_path = None
1441
- global info_dict
1442
-
1443
- # If URL/Local video file is provided
1444
- try:
1445
- info_dict, title = extract_video_info(url)
1446
- download_path = create_download_directory(title)
1447
- current_whsiper_model = whisper_model
1448
- video_path = download_video(url, download_path, info_dict, download_video_flag, current_whsiper_model)
1449
- global segments
1450
- audio_file_path, segments = perform_transcription(video_path, offset, whisper_model, vad_filter)
1451
-
1452
- if diarize:
1453
- transcription_text = combine_transcription_and_diarization(audio_file_path)
1454
- else:
1455
- audio_file, segments = perform_transcription(video_path, offset, whisper_model, vad_filter)
1456
- transcription_text = {'audio_file': audio_file, 'transcription': segments}
1457
-
1458
-
1459
- if audio_file_path is None or segments is None:
1460
- logging.error("Process_URL: Transcription failed or segments not available.")
1461
- return "Process_URL: Transcription failed.", "Transcription failed.", None, None, None, None
1462
-
1463
- logging.debug(f"Process_URL: Transcription audio_file: {audio_file_path}")
1464
- logging.debug(f"Process_URL: Transcription segments: {segments}")
1465
-
1466
- logging.debug(f"Process_URL: Transcription text: {transcription_text}")
1467
-
1468
- # FIXME - Implement chunking calls here
1469
- # Implement chunking calls here
1470
- chunked_transcriptions = []
1471
- if chunk_text_by_words:
1472
- chunked_transcriptions = chunk_text_by_words(transcription_text['transcription'], max_words)
1473
- elif chunk_text_by_sentences:
1474
- chunked_transcriptions = chunk_text_by_sentences(transcription_text['transcription'], max_sentences)
1475
- elif chunk_text_by_paragraphs:
1476
- chunked_transcriptions = chunk_text_by_paragraphs(transcription_text['transcription'], max_paragraphs)
1477
- elif chunk_text_by_tokens:
1478
- chunked_transcriptions = chunk_text_by_tokens(transcription_text['transcription'], max_tokens)
1479
- elif chunk_by_semantic:
1480
- chunked_transcriptions = semantic_chunking(transcription_text['transcription'], semantic_chunk_size, 'tokens')
1481
-
1482
- # If we did chunking, we now have the chunked transcripts in 'chunked_transcriptions'
1483
- elif rolling_summarization:
1484
- # FIXME - rolling summarization
1485
- # text = extract_text_from_segments(segments)
1486
- # summary_text = rolling_summarize_function(
1487
- # transcription_text,
1488
- # detail=detail_level,
1489
- # api_name=api_name,
1490
- # api_key=api_key,
1491
- # custom_prompt_input=custom_prompt_input,
1492
- # chunk_by_words=chunk_text_by_words,
1493
- # max_words=max_words,
1494
- # chunk_by_sentences=chunk_text_by_sentences,
1495
- # max_sentences=max_sentences,
1496
- # chunk_by_paragraphs=chunk_text_by_paragraphs,
1497
- # max_paragraphs=max_paragraphs,
1498
- # chunk_by_tokens=chunk_text_by_tokens,
1499
- # max_tokens=max_tokens
1500
- # )
1501
- pass
1502
- else:
1503
- pass
1504
-
1505
- summarized_chunk_transcriptions = []
1506
-
1507
- if chunk_text_by_words or chunk_text_by_sentences or chunk_text_by_paragraphs or chunk_text_by_tokens or chunk_by_semantic and api_name:
1508
- # Perform summarization based on chunks
1509
- for chunk in chunked_transcriptions:
1510
- summarized_chunks = []
1511
- if api_name == "anthropic":
1512
- summary = summarize_with_anthropic(api_key, chunk, custom_prompt_input)
1513
- elif api_name == "cohere":
1514
- summary = summarize_with_cohere(api_key, chunk, custom_prompt_input, temp, system_message)
1515
- elif api_name == "openai":
1516
- summary = summarize_with_openai(api_key, chunk, custom_prompt_input, temp, system_message)
1517
- elif api_name == "Groq":
1518
- summary = summarize_with_groq(api_key, chunk, custom_prompt_input, temp, system_message)
1519
- elif api_name == "DeepSeek":
1520
- summary = summarize_with_deepseek(api_key, chunk, custom_prompt_input, temp, system_message)
1521
- elif api_name == "OpenRouter":
1522
- summary = summarize_with_openrouter(api_key, chunk, custom_prompt_input, temp, system_message)
1523
- elif api_name == "Llama.cpp":
1524
- summary = summarize_with_llama(chunk, custom_prompt_input, temp, system_message)
1525
- elif api_name == "Kobold":
1526
- summary = summarize_with_kobold(chunk, custom_prompt_input, temp, system_message)
1527
- elif api_name == "Ooba":
1528
- summary = summarize_with_oobabooga(chunk, custom_prompt_input, temp, system_message)
1529
- elif api_name == "Tabbyapi":
1530
- summary = summarize_with_tabbyapi(chunk, custom_prompt_input, temp, system_message)
1531
- elif api_name == "VLLM":
1532
- summary = summarize_with_vllm(chunk, custom_prompt_input, temp, system_message)
1533
- summarized_chunk_transcriptions.append(summary)
1534
-
1535
- # Combine chunked transcriptions into a single file
1536
- combined_transcription_text = '\n\n'.join(chunked_transcriptions)
1537
- combined_transcription_file_path = os.path.join(download_path, 'combined_transcription.txt')
1538
- with open(combined_transcription_file_path, 'w') as f:
1539
- f.write(combined_transcription_text)
1540
-
1541
- # Combine summarized chunk transcriptions into a single file
1542
- combined_summary_text = '\n\n'.join(summarized_chunk_transcriptions)
1543
- combined_summary_file_path = os.path.join(download_path, 'combined_summary.txt')
1544
- with open(combined_summary_file_path, 'w') as f:
1545
- f.write(combined_summary_text)
1546
-
1547
- # Handle rolling summarization
1548
- if rolling_summarization:
1549
- summary_text = rolling_summarize(
1550
- text=extract_text_from_segments(segments),
1551
- detail=detail_level,
1552
- model='gpt-4-turbo',
1553
- additional_instructions=custom_prompt_input,
1554
- summarize_recursively=recursive_summarization
1555
- )
1556
- elif api_name:
1557
- summary_text = perform_summarization(api_name, segments_json_path, custom_prompt_input, api_key,
1558
- recursive_summarization, temp=None)
1559
- else:
1560
- summary_text = 'Summary not available'
1561
-
1562
- # Check to see if chunking was performed, and if so, return that instead
1563
- if chunk_text_by_words or chunk_text_by_sentences or chunk_text_by_paragraphs or chunk_text_by_tokens or chunk_by_semantic:
1564
- # Combine chunked transcriptions into a single file
1565
- # FIXME - validate this works....
1566
- json_file_path, summary_file_path = save_transcription_and_summary(combined_transcription_file_path, combined_summary_file_path, download_path, info_dict)
1567
- add_media_to_database(url, info_dict, segments, summary_text, keywords, custom_prompt_input, whisper_model)
1568
- return transcription_text, summary_text, json_file_path, summary_file_path, None, None
1569
- else:
1570
- json_file_path, summary_file_path = save_transcription_and_summary(transcription_text, summary_text, download_path, info_dict)
1571
- add_media_to_database(url, info_dict, segments, summary_text, keywords, custom_prompt_input, whisper_model)
1572
- return transcription_text, summary_text, json_file_path, summary_file_path, None, None
1573
-
1574
- except Exception as e:
1575
- logging.error(f": {e}")
1576
- return str(e), 'process_url: Error processing the request.', None, None, None, None
1577
-
1578
- #
1579
- #
1580
- ############################################################################################################################################