oceansweep committed on
Commit
05bf4dc
·
verified ·
1 Parent(s): 852b3e2

Upload 4 files

Browse files
App_Function_Libraries/Summarization/Chain_of_Event.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Imports
3
+ #
4
+ # 3rd-party modules
5
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
6
+ import nltk
7
+ from nltk import sent_tokenize
8
+ from collections import Counter
9
+
10
+
11
# Download NLTK data
# NOTE: this runs every time the module is imported; presumably the 'punkt'
# data is what sent_tokenize needs -- confirm against the installed NLTK version.
nltk.download('punkt')

# Load a pre-trained model and tokenizer for summarization
# NOTE: both from_pretrained calls execute at import time.
model_name = "facebook/bart-large-cnn" # You can also use "t5-base" or another model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Summarization pipeline
# Shared by abstract_events() and generate_summary() below.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
21
+
22
+
23
+ # Step 1: Specific Event Extraction
24
def extract_events(text):
    """
    Split the input text into events.

    Every sentence returned by ``sent_tokenize`` is treated as one event,
    and the resulting list of sentences is returned unchanged.
    """
    return sent_tokenize(text)
31
+
32
+
33
+ # Step 2: Event Abstraction and Generalization
34
def abstract_events(events):
    """
    Abstract each event by running it through the summarization pipeline.

    Returns a list with one generated summary string per input event, in
    the same order as ``events``.
    """
    generalized = []
    for sentence in events:
        outputs = summarizer(sentence, max_length=30, min_length=10, do_sample=False)
        generalized.append(outputs[0]['summary_text'])
    return generalized
42
+
43
+
44
+ # Step 3: Common Event Statistics
45
def common_events(abstracted_events):
    """
    Return the abstracted events that occur more than once.

    Events are reported once each, in the order they were first seen.
    """
    tally = Counter(abstracted_events)
    repeated = []
    for event, occurrences in tally.items():
        # Keep only events that were seen at least twice
        if occurrences > 1:
            repeated.append(event)
    return repeated
53
+
54
+
55
+ # Step 4: Summary Generation
56
def generate_summary(common_events):
    """
    Generate a concise summary from the most common events.

    Args:
        common_events: Iterable of event strings to merge into one summary.

    Returns:
        The summary text produced by the summarization pipeline, or an empty
        string when there is nothing to summarize.
    """
    combined_text = " ".join(common_events)
    # No event may repeat across the inputs; feeding an empty string to the
    # model is pointless, so report "no summary" instead.
    if not combined_text.strip():
        return ""
    summary = summarizer(combined_text, max_length=100, min_length=50, do_sample=False)[0]['summary_text']
    return summary
63
+
64
+
65
+ # Chain-of-Event Prompting Process
66
def chain_of_event_prompting(texts):
    """
    Run the complete Chain-of-Event Prompting pipeline over several texts.

    Steps:
    1. Extract the events of every text.
    2. Abstract / generalize those events.
    3. Keep the events that are common (repeated).
    4. Summarize the common events.
    """
    # Steps 1 and 2, flattened across all texts
    all_events = [
        abstracted
        for text in texts
        for abstracted in abstract_events(extract_events(text))
    ]

    # Steps 3 and 4
    return generate_summary(common_events(all_events))
84
+
85
+
86
+ # Example Usage
87
if __name__ == "__main__":
    # Three phrasings of the same announcement act as the demo input
    texts = [
        "The company announced a new product line which will be launched next month.",
        "A new product line is being developed by the company, with a launch expected in the near future.",
        "Next month, the company plans to introduce a new series of products to the market.",
    ]

    # Run the whole pipeline and show its result
    final_summary = chain_of_event_prompting(texts)
    print("Final Summary:", final_summary)
App_Function_Libraries/Summarization/Local_Summarization_Lib.py ADDED
@@ -0,0 +1,837 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Local_Summarization_Lib.py
2
+ #########################################
3
+ # Local Summarization Library
4
+ # This library is used to perform summarization with a 'local' inference engine.
5
+ #
6
+ ####
7
+ #
8
+ ####################
9
+ # Function List
10
+ # FIXME - UPDATE Function Arguments
11
+ # 1. summarize_with_local_llm(text, custom_prompt_arg)
12
+ # 2. summarize_with_llama(api_url, text, token, custom_prompt)
13
+ # 3. summarize_with_kobold(api_url, text, kobold_api_token, custom_prompt)
14
+ # 4. summarize_with_oobabooga(api_url, text, ooba_api_token, custom_prompt)
15
+ # 5. summarize_with_vllm(vllm_api_url, vllm_api_key_function_arg, llm_model, text, vllm_custom_prompt_function_arg)
16
+ # 6. summarize_with_tabbyapi(tabby_api_key, tabby_api_IP, text, tabby_model, custom_prompt)
17
+ # 7. save_summary_to_file(summary, file_path)
18
+ #
19
+ ###############################
20
+ # Import necessary libraries
21
+ import json
22
+ import logging
23
+ import os
24
+ from typing import Union
25
+
26
+ import requests
27
+ # Import 3rd-party Libraries
28
+ # Import Local
29
+ from App_Function_Libraries.Utils.Utils import load_and_log_configs, extract_text_from_segments
30
+ #
31
+ #######################################################################################################################
32
+ # Function Definitions
33
+ #
34
+
35
+ logger = logging.getLogger()
36
+
37
+
38
def summarize_with_local_llm(input_data, custom_prompt_arg, temp, system_message=None):
    """
    Summarize text with the chat-completions endpoint served on 127.0.0.1:8080.

    Args:
        input_data: Path to a JSON file, a list of segment dictionaries, a dict that
            already holds a 'summary' key, or the raw text to summarize.
        custom_prompt_arg: Instruction appended after the text in the user message.
        temp: Sampling temperature; sent as "temperature" when it is not None.
        system_message: Optional system prompt; a generic default is used when None.

    Returns:
        The summary string, or a "Local LLM: ..." message describing the failure.
        Errors are reported through the return value and are not raised.
    """
    try:
        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("Local LLM: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("Local LLM: Using provided string data for summarization")
            data = input_data

        logging.debug(f"Local LLM: Loaded data: {data}")
        logging.debug(f"Local LLM: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # If the loaded data is a dictionary and already contains a summary, return it
            logging.debug("Local LLM: Summary already exists in the loaded data")
            return data['summary']

        # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
        if isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("Invalid input data format")

        if system_message is None:
            system_message = "You are a helpful AI assistant."

        headers = {
            'Content-Type': 'application/json'
        }

        logging.debug("Local LLM: Preparing data + prompt for submittal")
        local_llm_prompt = f"{text} \n\n\n\n{custom_prompt_arg}"
        payload = {
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": local_llm_prompt},
            ],
            "max_tokens": 28000,  # Adjust tokens as needed
        }
        # Only send a temperature the caller actually chose; otherwise the server default applies
        if temp is not None:
            payload["temperature"] = temp

        logging.debug("Local LLM: Posting request")
        response = requests.post('http://127.0.0.1:8080/v1/chat/completions', headers=headers, json=payload)

        if response.status_code == 200:
            response_data = response.json()
            if 'choices' in response_data and len(response_data['choices']) > 0:
                summary = response_data['choices'][0]['message']['content'].strip()
                logging.debug("Local LLM: Summarization successful")
                print("Local LLM: Summarization successful.")
                return summary
            else:
                logging.warning("Local LLM: Summary not found in the response data")
                return "Local LLM: Summary not available"
        else:
            logging.debug("Local LLM: Summarization failed")
            print("Local LLM: Failed to process summary:", response.text)
            return "Local LLM: Failed to process summary"
    except Exception as e:
        # Best-effort API: callers receive an error string instead of an exception
        logging.debug("Local LLM: Error in processing: %s", str(e))
        print("Error occurred while processing summary with Local LLM:", str(e))
        return "Local LLM: Error occurred while processing summary"
109
+
110
+
111
def summarize_with_llama(input_data, custom_prompt, api_url="http://127.0.0.1:8080/completion", api_key=None, temp=None, system_message=None):
    """
    Summarize text by posting it to a llama.cpp server.

    Args:
        input_data: Path to a JSON file, a list of segment dictionaries, a dict that
            already holds a 'summary' key, or the raw text to summarize.
        custom_prompt: Instruction placed before the text in the prompt.
        api_url: Endpoint that receives the request.
        api_key: Optional API key; takes priority over the 'llama' key from the config.
        temp: Optional sampling temperature; left out of the request when None.
        system_message: Optional system prompt; a generic default is used when None.

    Returns:
        The summary string, or a "Llama: ..." message describing the failure.
        Errors are reported through the return value and are not raised.
    """
    try:
        logging.debug("Llama.cpp: Loading and validating configurations")
        loaded_config_data = load_and_log_configs()
        if loaded_config_data is None:
            logging.error("Failed to load configuration data")

        # Prioritize the API key passed as a parameter, then fall back to the config
        if api_key and api_key.strip():
            llama_api_key = api_key
            logging.info("Llama.cpp: Using API key provided as parameter")
        elif loaded_config_data is not None:
            llama_api_key = loaded_config_data['api_keys'].get('llama')
            if llama_api_key:
                logging.info("Llama.cpp: Using API key from config file")
            else:
                logging.warning("Llama.cpp: No API key found in config file")
        else:
            llama_api_key = None

        # Load transcript
        logging.debug("llama.cpp: Loading JSON data")
        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("Llama.cpp: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("Llama.cpp: Using provided string data for summarization")
            data = input_data

        logging.debug(f"Llama.cpp: Loaded data: {data}")
        logging.debug(f"Llama.cpp: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # If the loaded data is a dictionary and already contains a summary, return it
            logging.debug("Llama.cpp: Summary already exists in the loaded data")
            return data['summary']

        # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
        if isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("Llama.cpp: Invalid input data format")

        headers = {
            'accept': 'application/json',
            'content-type': 'application/json',
        }
        # Use the resolved key (parameter or config); the key may legitimately be absent
        if llama_api_key and len(llama_api_key) > 5:
            headers['Authorization'] = f'Bearer {llama_api_key}'

        llama_prompt = f"{custom_prompt} \n\n\n\n{text}"
        if system_message is None:
            system_message = "You are a helpful AI assistant."
        logging.debug(f"llama: Prompt being sent is {llama_prompt}")

        # NOTE(review): llama.cpp's /completion endpoint is documented to read "prompt";
        # "messages" is kept for OpenAI-style chat endpoints -- confirm against the
        # server version in use.
        payload = {
            "prompt": f"{system_message}\n\n{llama_prompt}",
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": llama_prompt}
            ],
            "max_tokens": 4096,
        }
        if temp is not None:
            payload["temperature"] = temp

        logging.debug("llama: Submitting request to API endpoint")
        print("llama: Submitting request to API endpoint")
        response = requests.post(api_url, headers=headers, json=payload)

        if response.status_code != 200:
            # Checked before parsing so that non-JSON error bodies are still reported
            logging.error(f"Llama: API request failed with status code {response.status_code}: {response.text}")
            return f"Llama: API request failed: {response.text}"

        response_data = response.json()
        logging.debug("API Response Data: %s", response_data)
        if 'content' in response_data:
            summary = response_data['content'].strip()
        else:
            # OpenAI-style chat response shape
            summary = response_data['choices'][0]['message']['content'].strip()
        logging.debug("llama: Summarization successful")
        print("Summarization successful.")
        return summary

    except Exception as e:
        # Best-effort API: callers receive an error string instead of an exception
        logging.error("Llama: Error in processing: %s", str(e))
        return f"Llama: Error occurred while processing summary with llama: {str(e)}"
201
+
202
+
203
+ # https://lite.koboldai.net/koboldcpp_api#/api%2Fv1/post_api_v1_generate
204
def summarize_with_kobold(input_data, api_key, custom_prompt_input, kobold_api_ip="http://127.0.0.1:5001/api/v1/generate", temp=None, system_message=None):
    """
    Summarize text through a KoboldCpp generate endpoint.

    Args:
        input_data: Path to a JSON file, a list of segment dictionaries, a dict that
            already holds a 'summary' key, or the raw text to summarize.
        api_key: Optional API key; takes priority over the 'kobold' key from the config.
            The key is only resolved and logged (masked); it is not sent with the request.
        custom_prompt_input: Instruction placed before the text in the prompt.
        kobold_api_ip: Endpoint used when the config provides no 'kobold' entry under
            'local_api_ip'; a configured value takes precedence.
        temp: Optional sampling temperature; 0.7 is used when None.
        system_message: Accepted for signature parity with the other backends; unused here.

    Returns:
        The summary string, or a message describing the failure. Errors are reported
        through the return value and are not raised.
    """
    logging.debug("Kobold: Summarization process starting...")
    try:
        logging.debug("Kobold: Loading and validating configurations")
        loaded_config_data = load_and_log_configs()
        if loaded_config_data is None:
            logging.error("Failed to load configuration data")

        # Prioritize the API key passed as a parameter, then fall back to the config
        if api_key and api_key.strip():
            kobold_api_key = api_key
            logging.info("Kobold: Using API key provided as parameter")
        elif loaded_config_data is not None:
            kobold_api_key = loaded_config_data['api_keys'].get('kobold')
            if kobold_api_key:
                logging.info("Kobold: Using API key from config file")
            else:
                logging.warning("Kobold: No API key found in config file")
        else:
            kobold_api_key = None

        # Slicing a missing key would raise, so only log the masked key when one exists
        if kobold_api_key:
            logging.debug(f"Kobold: Using API Key: {kobold_api_key[:5]}...{kobold_api_key[-5:]}")

        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("Kobold.cpp: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("Kobold.cpp: Using provided string data for summarization")
            data = input_data

        logging.debug(f"Kobold.cpp: Loaded data: {data}")
        logging.debug(f"Kobold.cpp: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # If the loaded data is a dictionary and already contains a summary, return it
            logging.debug("Kobold.cpp: Summary already exists in the loaded data")
            return data['summary']

        # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
        if isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("Kobold.cpp: Invalid input data format")

        headers = {
            'accept': 'application/json',
            'content-type': 'application/json',
        }

        kobold_prompt = f"{custom_prompt_input}\n\n\n\n{text}"
        logging.debug(f"kobold: Prompt being sent is {kobold_prompt}")

        # FIXME
        # Values literally c/p from the api docs....
        payload = {
            "max_context_length": 8096,
            "max_length": 4096,
            "prompt": kobold_prompt,
            "temperature": temp if temp is not None else 0.7,
        }

        # The configured endpoint wins; the parameter is the fallback when the
        # config is missing or has no kobold entry.
        try:
            endpoint = loaded_config_data['local_api_ip']['kobold'] or kobold_api_ip
        except (KeyError, TypeError):
            endpoint = kobold_api_ip

        logging.debug("kobold: Submitting request to API endpoint")
        print("kobold: Submitting request to API endpoint")
        response = requests.post(endpoint, headers=headers, json=payload)
        logging.debug("kobold: API Response Status Code: %d", response.status_code)

        if response.status_code != 200:
            logging.error(f"kobold: API request failed with status code {response.status_code}: {response.text}")
            return f"kobold: API request failed: {response.text}"

        try:
            response_data = response.json()
        except ValueError as e:
            logging.error("kobold: Error parsing JSON response: %s", str(e))
            return f"Error parsing JSON response: {str(e)}"
        logging.debug("kobold: API Response Data: %s", response_data)

        if response_data and 'results' in response_data and len(response_data['results']) > 0:
            summary = response_data['results'][0]['text'].strip()
            logging.debug("kobold: Summarization successful")
            return summary

        logging.error("Expected data not found in API response.")
        return "Expected data not found in API response."
    except Exception as e:
        # Best-effort API: callers receive an error string instead of an exception
        logging.error("kobold: Error in processing: %s", str(e))
        return f"kobold: Error occurred while processing summary with kobold: {str(e)}"
303
+
304
+
305
+ # https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API
306
def summarize_with_oobabooga(input_data, api_key, custom_prompt, api_url="http://127.0.0.1:5000/v1/chat/completions", temp=None, system_message=None):
    """
    Summarize text through the chat-completions API of text-generation-webui.

    Args:
        input_data: Path to a JSON file, a list of segment dictionaries, a dict that
            already holds a 'summary' key, or the raw text to summarize.
        api_key: Optional API key; takes priority over the 'ooba' key from the config.
            The key is only resolved and logged (masked); it is not sent with the request.
        custom_prompt: Instruction appended after the text in the user message.
        api_url: Endpoint that receives the request.
        temp: Optional sampling temperature; left out of the request when None.
        system_message: Optional system prompt; a generic default is used when None.

    Returns:
        The summary string, or an "ooba: ..." message describing the failure.
        Errors are reported through the return value and are not raised.
    """
    logging.debug("Oobabooga: Summarization process starting...")
    try:
        logging.debug("Oobabooga: Loading and validating configurations")
        loaded_config_data = load_and_log_configs()
        if loaded_config_data is None:
            logging.error("Failed to load configuration data")

        # Prioritize the API key passed as a parameter, then fall back to the config
        if api_key and api_key.strip():
            ooba_api_key = api_key
            logging.info("Oobabooga: Using API key provided as parameter")
        elif loaded_config_data is not None:
            ooba_api_key = loaded_config_data['api_keys'].get('ooba')
            if ooba_api_key:
                logging.info("Oobabooga: Using API key from config file")
            else:
                logging.warning("Oobabooga: No API key found in config file")
        else:
            ooba_api_key = None

        # Slicing a missing key would raise, so only log the masked key when one exists
        if ooba_api_key:
            logging.debug(f"Oobabooga: Using API Key: {ooba_api_key[:5]}...{ooba_api_key[-5:]}")

        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("Oobabooga: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("Oobabooga: Using provided string data for summarization")
            data = input_data

        logging.debug(f"Oobabooga: Loaded data: {data}")
        logging.debug(f"Oobabooga: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # If the loaded data is a dictionary and already contains a summary, return it
            logging.debug("Oobabooga: Summary already exists in the loaded data")
            return data['summary']

        # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
        if isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("Invalid input data format")

        headers = {
            'accept': 'application/json',
            'content-type': 'application/json',
        }

        ooba_prompt = f"{text}" + f"\n\n\n\n{custom_prompt}"
        logging.debug(f"ooba: Prompt being sent is {ooba_prompt}")

        if system_message is None:
            system_message = "You are a helpful AI assistant."

        payload = {
            "mode": "chat",
            "character": "Example",
            "messages": [{"role": "user", "content": ooba_prompt}],
            "system_message": system_message,
        }
        if temp is not None:
            payload["temperature"] = temp

        logging.debug("ooba: Submitting request to API endpoint")
        print("ooba: Submitting request to API endpoint")
        # NOTE(review): verify=False disables TLS certificate checks. It has no effect on
        # the default http:// URL but weakens security for https:// endpoints.
        response = requests.post(api_url, headers=headers, json=payload, verify=False)
        logging.debug("ooba: API Response Data: %s", response)

        if response.status_code == 200:
            response_data = response.json()
            summary = response_data['choices'][0]['message']['content']
            logging.debug("ooba: Summarization successful")
            print("Summarization successful.")
            return summary
        else:
            logging.error(f"oobabooga: API request failed with status code {response.status_code}: {response.text}")
            return f"ooba: API request failed with status code {response.status_code}: {response.text}"

    except Exception as e:
        # Best-effort API: callers receive an error string instead of an exception
        logging.error("ooba: Error in processing: %s", str(e))
        return f"ooba: Error occurred while processing summary with oobabooga: {str(e)}"
393
+
394
+
395
+
396
def summarize_with_tabbyapi(input_data, custom_prompt_input, api_key=None, api_IP="http://127.0.0.1:5000/v1/chat/completions", temp=None, system_message=None):
    """
    Summarize text through a TabbyAPI chat-completions endpoint.

    Args:
        input_data: Path to a JSON file, a list of segment dictionaries, a dict that
            already holds a 'summary' key, or the raw text to summarize.
        custom_prompt_input: Instruction placed before the text in the user message.
        api_key: Optional API key; takes priority over the 'tabby' key from the config.
        api_IP: Endpoint used when the config provides no 'tabby' entry under
            'local_api_ip'; a configured value takes precedence.
        temp: Optional sampling temperature; 0.7 is used when None.
        system_message: Optional system prompt; a generic default is used when None.

    Returns:
        The summary string, or a message describing the failure. Errors are reported
        through the return value and are not raised.
    """
    logging.debug("TabbyAPI: Summarization process starting...")
    try:
        logging.debug("TabbyAPI: Loading and validating configurations")
        loaded_config_data = load_and_log_configs()
        if loaded_config_data is None:
            logging.error("Failed to load configuration data")

        # Prioritize the API key passed as a parameter, then fall back to the config
        if api_key and api_key.strip():
            tabby_api_key = api_key
            logging.info("TabbyAPI: Using API key provided as parameter")
        elif loaded_config_data is not None:
            tabby_api_key = loaded_config_data['api_keys'].get('tabby')
            if tabby_api_key:
                logging.info("TabbyAPI: Using API key from config file")
            else:
                logging.warning("TabbyAPI: No API key found in config file")
        else:
            tabby_api_key = None

        # The configured endpoint wins; the parameter is the fallback when the
        # config is missing or has no tabby entry.
        try:
            tabby_api_ip = loaded_config_data['local_api_ip']['tabby'] or api_IP
        except (KeyError, TypeError):
            tabby_api_ip = api_IP
        try:
            tabby_model = loaded_config_data['models']['tabby']
        except (KeyError, TypeError):
            tabby_model = None
        if temp is None:
            temp = 0.7

        # Slicing a missing key would raise, so only log the masked key when one exists
        if tabby_api_key:
            logging.debug(f"TabbyAPI: Using API Key: {tabby_api_key[:5]}...{tabby_api_key[-5:]}")

        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("tabby: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("tabby: Using provided string data for summarization")
            data = input_data

        logging.debug(f"tabby: Loaded data: {data}")
        logging.debug(f"tabby: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # If the loaded data is a dictionary and already contains a summary, return it
            logging.debug("tabby: Summary already exists in the loaded data")
            return data['summary']

        # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
        if isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("Invalid input data format")
        if system_message is None:
            system_message = "You are a helpful AI assistant."

        headers = {
            'Content-Type': 'application/json'
        }
        # Send the resolved key (parameter or config), never a literal "Bearer None"
        if tabby_api_key:
            headers['Authorization'] = f'Bearer {tabby_api_key}'

        # Chat-completions endpoints expect a list of role/content messages, so the
        # extracted text and the prompt are sent as the user turn.
        payload = {
            'max_tokens': 4096,
            "min_tokens": 0,
            'temperature': temp,
            'messages': [
                {"role": "system", "content": system_message},
                {"role": "user", "content": f"{custom_prompt_input}\n\n{text}"},
            ],
        }
        if tabby_model:
            payload['model'] = tabby_model

        response = requests.post(tabby_api_ip, headers=headers, json=payload)

        if response.status_code == 200:
            response_json = response.json()

            # Validate the response structure
            if all(key in response_json for key in ['id', 'choices', 'created', 'model', 'object', 'usage']):
                logging.info("TabbyAPI: Received a valid 200 response")
                summary = response_json['choices'][0].get('message', {}).get('content', '')
                return summary
            else:
                logging.error("TabbyAPI: Received a 200 response, but the structure is invalid")
                return "Error: Received an invalid response structure from TabbyAPI."

        elif response.status_code == 422:
            logging.error(f"TabbyAPI: Received a 422 error. Details: {response.json()}")
            return "Error: Invalid request sent to TabbyAPI."

        else:
            response.raise_for_status()  # This will raise an exception for 4xx/5xx status codes
            # Other non-error codes would otherwise fall through and return None
            logging.error(f"TabbyAPI: Unexpected status code {response.status_code}")
            return f"Error: Unexpected status code {response.status_code} from TabbyAPI."

    except requests.exceptions.RequestException as e:
        logging.error(f"Error summarizing with TabbyAPI: {e}")
        return f"Error summarizing with TabbyAPI: {str(e)}"
    except json.JSONDecodeError:
        logging.error("TabbyAPI: Received an invalid JSON response")
        return "Error: Received an invalid JSON response from TabbyAPI."
    except Exception as e:
        logging.error(f"Unexpected error in summarize_with_tabbyapi: {e}")
        return f"Unexpected error in summarization process: {str(e)}"
499
+
500
def summarize_with_vllm(
        input_data: Union[str, dict, list],
        custom_prompt_input: str,
        api_key: str = None,
        vllm_api_url: str = "http://127.0.0.1:8000/v1/chat/completions",
        model: str = None,
        system_prompt: str = None,
        temp: float = 0.7
) -> str:
    """
    Summarize text through a vLLM chat-completions endpoint.

    Args:
        input_data: Path to a JSON file, a list of segment dictionaries, a dict
            (returned as-is via its 'summary' key when present, otherwise serialized
            to JSON text), or the raw text to summarize.
        custom_prompt_input: Instruction placed before the text in the user message.
        api_key: Optional API key; takes priority over the 'vllm' key from the config.
        vllm_api_url: Endpoint that receives the request.
        model: Model name; the 'vllm' entry of the config's 'models' is used when empty.
        system_prompt: Optional system prompt; a generic default is used when None.
        temp: Sampling temperature; left out of the request when None.

    Returns:
        The summary string, or an "Error: ..." message describing the failure.
        Errors are reported through the return value and are not raised.
    """
    logging.debug("vLLM: Summarization process starting...")
    try:
        logging.debug("vLLM: Loading and validating configurations")
        loaded_config_data = load_and_log_configs()
        if loaded_config_data is None:
            logging.error("Failed to load configuration data")

        # Prioritize the API key passed as a parameter, then fall back to the config
        if api_key and api_key.strip():
            vllm_api_key = api_key
            logging.info("vLLM: Using API key provided as parameter")
        elif loaded_config_data is not None:
            vllm_api_key = loaded_config_data['api_keys'].get('vllm')
            if vllm_api_key:
                logging.info("vLLM: Using API key from config file")
            else:
                logging.warning("vLLM: No API key found in config file")
        else:
            vllm_api_key = None

        # Slicing a missing key would raise, so only log the masked key when one exists
        if vllm_api_key:
            logging.debug(f"vLLM: Using API Key: {vllm_api_key[:5]}...{vllm_api_key[-5:]}")

        # Process input data
        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("vLLM: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("vLLM: Using provided data for summarization")
            data = input_data

        logging.debug(f"vLLM: Type of data: {type(data)}")

        # Extract text for summarization
        if isinstance(data, dict) and 'summary' in data:
            logging.debug("vLLM: Summary already exists in the loaded data")
            return data['summary']
        elif isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        elif isinstance(data, dict):
            text = json.dumps(data)
        else:
            raise ValueError("Invalid input data format")

        logging.debug(f"vLLM: Extracted text (showing first 500 chars): {text[:500]}...")

        if system_prompt is None:
            system_prompt = "You are a helpful AI assistant."

        # An explicit model must work even when the config could not be loaded
        if not model:
            if loaded_config_data is None:
                raise ValueError("vLLM: No model specified and configuration could not be loaded")
            model = loaded_config_data['models']['vllm']

        # Prepare the API request
        headers = {
            "Content-Type": "application/json"
        }
        if vllm_api_key:
            headers["Authorization"] = f"Bearer {vllm_api_key}"

        payload = {
            "model": model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"{custom_prompt_input}\n\n{text}"}
            ]
        }
        if temp is not None:
            payload["temperature"] = temp

        # Make the API call
        logging.debug(f"vLLM: Sending request to {vllm_api_url}")
        response = requests.post(vllm_api_url, headers=headers, json=payload)

        # Check for successful response
        response.raise_for_status()

        # Extract and return the summary
        response_data = response.json()
        if 'choices' in response_data and len(response_data['choices']) > 0:
            summary = response_data['choices'][0]['message']['content']
            logging.debug("vLLM: Summarization successful")
            logging.debug(f"vLLM: Summary (first 500 chars): {summary[:500]}...")
            return summary
        else:
            raise ValueError("Unexpected response format from vLLM API")

    except requests.RequestException as e:
        logging.error(f"vLLM: API request failed: {str(e)}")
        return f"Error: vLLM API request failed - {str(e)}"
    except json.JSONDecodeError as e:
        logging.error(f"vLLM: Failed to parse API response: {str(e)}")
        return f"Error: Failed to parse vLLM API response - {str(e)}"
    except Exception as e:
        logging.error(f"vLLM: Unexpected error during summarization: {str(e)}")
        return f"Error: Unexpected error during vLLM summarization - {str(e)}"
602
+
603
+
604
+ # FIXME - update to be a summarize request
605
def summarize_with_ollama(input_data, custom_prompt, api_url="http://127.0.0.1:11434/api/generate", api_key=None, temp=None, system_message=None, model=None):
    """
    Summarize text through an Ollama server.

    Args:
        input_data: Path to a JSON file, a list of segment dictionaries, a dict that
            already holds a 'summary' key, or the raw text to summarize.
        custom_prompt: Instruction placed before the text in the prompt.
        api_url: Endpoint that receives the request.
        api_key: Optional API key; takes priority over the 'ollama' key from the config.
        temp: Optional sampling temperature; left out of the request when None.
        system_message: Optional system prompt; a generic default is used when None.
        model: Model name; the 'ollama' entry of the config's 'models' is used when empty.

    Returns:
        The summary string, or an "Ollama: ..." message describing the failure.
        Errors are reported through the return value and are not raised.
    """
    try:
        logging.debug("ollama: Loading and validating configurations")
        loaded_config_data = load_and_log_configs()
        if loaded_config_data is None:
            logging.error("Failed to load configuration data")

        # Prioritize the API key passed as a parameter, then fall back to the config
        if api_key and api_key.strip():
            ollama_api_key = api_key
            logging.info("Ollama: Using API key provided as parameter")
        elif loaded_config_data is not None:
            ollama_api_key = loaded_config_data['api_keys'].get('ollama')
            if ollama_api_key:
                logging.info("Ollama: Using API key from config file")
            else:
                logging.warning("Ollama: No API key found in config file")
        else:
            ollama_api_key = None

        # Honor an explicitly requested model; only fall back to the configured one
        if not model:
            if loaded_config_data is None:
                raise ValueError("Ollama: No model specified and configuration could not be loaded")
            model = loaded_config_data['models']['ollama']

        # Load transcript
        logging.debug("Ollama: Loading JSON data")
        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("Ollama: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("Ollama: Using provided string data for summarization")
            data = input_data

        logging.debug(f"Ollama: Loaded data: {data}")
        logging.debug(f"Ollama: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # If the loaded data is a dictionary and already contains a summary, return it
            logging.debug("Ollama: Summary already exists in the loaded data")
            return data['summary']

        # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
        if isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("Ollama: Invalid input data format")

        headers = {
            'accept': 'application/json',
            'content-type': 'application/json',
        }
        # The key may legitimately be absent for a local server
        if ollama_api_key and len(ollama_api_key) > 5:
            headers['Authorization'] = f'Bearer {ollama_api_key}'

        ollama_prompt = f"{custom_prompt} \n\n\n\n{text}"
        if system_message is None:
            system_message = "You are a helpful AI assistant."
        logging.debug(f"Ollama: Prompt being sent is {ollama_prompt}")

        # NOTE(review): per the Ollama API docs, /api/generate reads "prompt"/"system"
        # while /api/chat reads "messages"; both shapes are sent so either endpoint can
        # be used -- confirm unknown fields are ignored by the server version in use.
        # "stream": False requests a single JSON object instead of a line-delimited stream.
        payload = {
            "model": model,
            "prompt": ollama_prompt,
            "system": system_message,
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": ollama_prompt},
            ],
            "stream": False,
        }
        if temp is not None:
            payload["options"] = {"temperature": temp}

        logging.debug("Ollama: Submitting request to API endpoint")
        print("Ollama: Submitting request to API endpoint")
        response = requests.post(api_url, headers=headers, json=payload)

        if response.status_code != 200:
            # Checked before parsing so that non-JSON error bodies are still reported
            logging.error(f"Ollama: API request failed with status code {response.status_code}: {response.text}")
            return f"Ollama: API request failed: {response.text}"

        response_data = response.json()
        logging.debug("API Response Data: %s", response_data)

        if isinstance(response_data.get('message'), dict):
            # /api/chat response shape
            summary = response_data['message'].get('content', '')
        elif 'response' in response_data:
            # /api/generate response shape
            summary = response_data['response']
        elif 'choices' in response_data:
            # OpenAI-style chat response shape
            summary = response_data['choices'][0]['message']['content']
        else:
            # Shape the original implementation expected
            summary = response_data['content']
        summary = summary.strip()
        logging.debug("Ollama: Summarization successful")
        print("Summarization successful.")
        return summary

    except Exception as e:
        # Best-effort API: callers receive an error string instead of an exception
        logging.error("Ollama: Error in processing: %s", str(e))
        return f"Ollama: Error occurred while processing summary with ollama: {str(e)}"
700
+
701
+
702
+ # FIXME - update to be a summarize request
703
def summarize_with_custom_openai(api_key, input_data, custom_prompt_arg, temp=None, system_message=None):
    """Summarize text through a user-configured OpenAI-compatible endpoint.

    Args:
        api_key: Bearer token; when blank, ``custom_openai_api_key`` from the
            configured API keys is used.
        input_data: A JSON string (starting with ``{``), a path to a JSON
            file, raw text, a list of segment dicts, or a dict.
        custom_prompt_arg: Instruction text appended after the content.
        temp: Sampling temperature; defaults to 0.7.
        system_message: System prompt; a generic default is used when omitted.

    Returns:
        The model's reply, the existing ``summary`` value when the input
        already contains one, or a string starting with
        ``"Custom OpenAI API:"`` that describes the failure.
    """
    loaded_config_data = load_and_log_configs()
    custom_openai_api_key = api_key
    try:
        if loaded_config_data is None:
            logging.error("Custom OpenAI API: Failed to load configuration data")
            return "Custom OpenAI API: Failed to load configuration data"

        # API key validation: the parameter wins, the config is the fallback.
        if not custom_openai_api_key or not custom_openai_api_key.strip():
            logging.info("Custom OpenAI API: API key not provided as parameter")
            logging.info("Custom OpenAI API: Attempting to use API key from config file")
            custom_openai_api_key = loaded_config_data['api_keys'].get('custom_openai_api_key')

        if not custom_openai_api_key or not custom_openai_api_key.strip():
            logging.error("Custom OpenAI API: API key not found or is empty")
            return "Custom OpenAI API: API Key Not Provided/Found in Config file or is empty"

        # Log the key once, and never in full: slicing a short key with
        # [:5] and [-5:] would reveal every character of it.
        if len(custom_openai_api_key) > 10:
            masked_key = f"{custom_openai_api_key[:5]}...{custom_openai_api_key[-5:]}"
        else:
            masked_key = "<redacted>"
        logging.debug(f"Custom OpenAI API: Using API Key: {masked_key}")

        # Input data handling
        logging.debug(f"Custom OpenAI API: Raw input data type: {type(input_data)}")
        logging.debug(f"Custom OpenAI API: Raw input data (first 500 chars): {str(input_data)[:500]}...")

        if isinstance(input_data, str):
            if input_data.strip().startswith('{'):
                # It's likely a JSON string
                logging.debug("Custom OpenAI API: Parsing provided JSON string data for summarization")
                try:
                    data = json.loads(input_data)
                except json.JSONDecodeError as e:
                    logging.error(f"Custom OpenAI API: Error parsing JSON string: {str(e)}")
                    return f"Custom OpenAI API: Error parsing JSON input: {str(e)}"
            elif os.path.isfile(input_data):
                logging.debug("Custom OpenAI API: Loading JSON data from file for summarization")
                with open(input_data, 'r', encoding='utf-8') as file:
                    data = json.load(file)
            else:
                logging.debug("Custom OpenAI API: Using provided string data for summarization")
                data = input_data
        else:
            data = input_data

        logging.debug(f"Custom OpenAI API: Processed data type: {type(data)}")
        logging.debug(f"Custom OpenAI API: Processed data (first 500 chars): {str(data)[:500]}...")

        # Text extraction
        if isinstance(data, dict):
            if 'summary' in data:
                logging.debug("Custom OpenAI API: Summary already exists in the loaded data")
                return data['summary']
            elif 'segments' in data:
                text = extract_text_from_segments(data['segments'])
            else:
                text = json.dumps(data)  # Convert dict to string if no specific format
        elif isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError(f"Custom OpenAI API: Invalid input data format: {type(data)}")

        logging.debug(f"Custom OpenAI API: Extracted text (first 500 chars): {text[:500]}...")
        logging.debug(f"Custom OpenAI API: Custom prompt: {custom_prompt_arg}")

        # The model name is shared with the regular OpenAI configuration.
        openai_model = loaded_config_data['models']['openai'] or "gpt-4o"
        logging.debug(f"Custom OpenAI API: Using model: {openai_model}")

        headers = {
            'Authorization': f'Bearer {custom_openai_api_key}',
            'Content-Type': 'application/json'
        }

        logging.debug("Custom OpenAI API: Preparing data + prompt for submittal")
        openai_prompt = f"{text} \n\n\n\n{custom_prompt_arg}"
        if temp is None:
            temp = 0.7
        if system_message is None:
            system_message = "You are a helpful AI assistant who does whatever the user requests."
        temp = float(temp)
        payload = {
            "model": openai_model,
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": openai_prompt}
            ],
            "max_tokens": 4096,
            "temperature": temp
        }

        custom_openai_url = loaded_config_data['Local_api_ip']['custom_openai_api_ip']

        logging.debug("Custom OpenAI API: Posting request")
        response = requests.post(custom_openai_url, headers=headers, json=payload)
        logging.debug(f"Custom OpenAI API full API response data: {response}")
        if response.status_code == 200:
            response_data = response.json()
            logging.debug(response_data)
            if 'choices' in response_data and len(response_data['choices']) > 0:
                chat_response = response_data['choices'][0]['message']['content'].strip()
                logging.debug("Custom OpenAI API: Chat Sent successfully")
                logging.debug(f"Custom OpenAI API: Chat response: {chat_response}")
                return chat_response
            else:
                logging.warning("Custom OpenAI API: Chat response not found in the response data")
                return "Custom OpenAI API: Chat not available"
        else:
            logging.error(f"Custom OpenAI API: Chat request failed with status code {response.status_code}")
            logging.error(f"Custom OpenAI API: Error response: {response.text}")
            return f"Custom OpenAI API: Failed to process chat response. Status code: {response.status_code}"
    except json.JSONDecodeError as e:
        logging.error(f"Custom OpenAI API: Error decoding JSON: {str(e)}", exc_info=True)
        return f"Custom OpenAI API: Error decoding JSON input: {str(e)}"
    except requests.RequestException as e:
        logging.error(f"Custom OpenAI API: Error making API request: {str(e)}", exc_info=True)
        return f"Custom OpenAI API: Error making API request: {str(e)}"
    except Exception as e:
        logging.error(f"Custom OpenAI API: Unexpected error: {str(e)}", exc_info=True)
        return f"Custom OpenAI API: Unexpected error occurred: {str(e)}"
820
+
821
+
822
def save_summary_to_file(summary, file_path):
    """Write *summary* beside *file_path* as ``<base name>_summary.txt``.

    Args:
        summary: Text to write.
        file_path: Path of the source file. Its directory and extension-less
            base name determine where the summary is written.

    The target directory is created when it does not exist. A bare file name
    (no directory component) writes into the current working directory.
    """
    logging.debug("Now saving summary to file...")
    target_dir = os.path.dirname(file_path)
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    summary_file_path = os.path.join(target_dir, base_name + '_summary.txt')
    # os.makedirs('') raises FileNotFoundError, so skip it for bare file names.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    logging.debug("Opening summary file for writing, *segments.json with *_summary.txt")
    # Explicit UTF-8 keeps non-ASCII summaries writable on every platform.
    with open(summary_file_path, 'w', encoding='utf-8') as file:
        file.write(summary)
    logging.info(f"Summary saved to file: {summary_file_path}")
831
+
832
+ #
833
+ #
834
+ #######################################################################################################################
835
+
836
+
837
+
App_Function_Libraries/Summarization/Summarization_General_Lib.py ADDED
@@ -0,0 +1,1580 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Summarization_General_Lib.py
2
+ #########################################
3
+ # General Summarization Library
4
+ # This library is used to perform summarization.
5
+ #
6
+ ####
7
+ ####################
8
+ # Function List
9
+ #
10
+ # 1. extract_text_from_segments(segments: List[Dict]) -> str
11
+ # 2. summarize_with_openai(api_key, input_data, custom_prompt_arg, temp=None, system_message=None)
12
+ # 3. summarize_with_anthropic(api_key, input_data, custom_prompt_arg, temp=None, system_message=None, max_retries=3, retry_delay=5)
13
+ # 4. summarize_with_cohere(api_key, input_data, custom_prompt_arg, temp=None, system_message=None)
14
+ # 5. summarize_with_groq(api_key, input_data, custom_prompt_arg, temp=None, system_message=None)
15
+ #
16
+ #
17
+ ####################
18
+ # Import necessary libraries
19
+ import json
20
+ import logging
21
+ import os
22
+ import time
23
+ from typing import Optional
24
+
25
+ import requests
26
+ from requests import RequestException
27
+
28
+ from App_Function_Libraries.Audio_Transcription_Lib import convert_to_wav, speech_to_text
29
+ from App_Function_Libraries.Chunk_Lib import semantic_chunking, rolling_summarize, recursive_summarize_chunks, \
30
+ improved_chunking_process
31
+ from App_Function_Libraries.Diarization_Lib import combine_transcription_and_diarization
32
+ from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_llama, summarize_with_kobold, \
33
+ summarize_with_oobabooga, summarize_with_tabbyapi, summarize_with_vllm, summarize_with_local_llm
34
+ from App_Function_Libraries.DB.DB_Manager import add_media_to_database
35
+ # Import Local
36
+ from App_Function_Libraries.Utils.Utils import load_and_log_configs, load_comprehensive_config, sanitize_filename, \
37
+ clean_youtube_url, create_download_directory, is_valid_url
38
+ from App_Function_Libraries.Video_DL_Ingestion_Lib import download_video, extract_video_info
39
+
40
+ #
41
+ #######################################################################################################################
42
+ # Function Definitions
43
+ #
44
# Module-level configuration object, loaded when this module is imported.
config = load_comprehensive_config()
# Looks up option 'openai_api_key' in section 'API'.
# NOTE(review): fallback=None presumably yields None when the option is
# missing (configparser-style get) — confirm against load_comprehensive_config.
openai_api_key = config.get('API', 'openai_api_key', fallback=None)
46
+
47
+
48
def summarize(
        input_data: str,
        custom_prompt_arg: Optional[str],
        api_name: str,
        api_key: Optional[str],
        temp: Optional[float],
        system_message: Optional[str]
) -> str:
    """Dispatch a summarization request to the backend named by *api_name*.

    Args:
        input_data: Content to summarize, passed through to the backend.
        custom_prompt_arg: Optional instruction text for the backend.
        api_name: Backend name, matched case-insensitively (for example
            ``"openai"``, ``"anthropic"``, ``"llama.cpp"``).
        api_key: Optional API key, forwarded to backends that accept one.
        temp: Optional sampling temperature.
        system_message: Optional system prompt.

    Returns:
        Whatever the selected backend returns, or a string starting with
        ``"Error:"`` when the name is unknown or the backend raises.
    """
    try:
        logging.debug(f"api_name type: {type(api_name)}, value: {api_name}")
        backend = api_name.lower()

        # Lambdas defer the call, so only the selected backend runs. Each
        # backend keeps its own argument order.
        dispatch = {
            "openai": lambda: summarize_with_openai(api_key, input_data, custom_prompt_arg, temp, system_message),
            "anthropic": lambda: summarize_with_anthropic(api_key, input_data, custom_prompt_arg, temp, system_message),
            "cohere": lambda: summarize_with_cohere(api_key, input_data, custom_prompt_arg, temp, system_message),
            "groq": lambda: summarize_with_groq(api_key, input_data, custom_prompt_arg, temp, system_message),
            "huggingface": lambda: summarize_with_huggingface(api_key, input_data, custom_prompt_arg, temp),
            "openrouter": lambda: summarize_with_openrouter(api_key, input_data, custom_prompt_arg, temp, system_message),
            "deepseek": lambda: summarize_with_deepseek(api_key, input_data, custom_prompt_arg, temp, system_message),
            "mistral": lambda: summarize_with_mistral(api_key, input_data, custom_prompt_arg, temp, system_message),
            "llama.cpp": lambda: summarize_with_llama(input_data, custom_prompt_arg, temp, system_message),
            "kobold": lambda: summarize_with_kobold(input_data, api_key, custom_prompt_arg, temp, system_message),
            "ooba": lambda: summarize_with_oobabooga(input_data, api_key, custom_prompt_arg, temp, system_message),
            "tabbyapi": lambda: summarize_with_tabbyapi(input_data, custom_prompt_arg, temp, system_message),
            # NOTE(review): the third argument has always been None here, not temp.
            "vllm": lambda: summarize_with_vllm(input_data, custom_prompt_arg, None, system_message),
            "local-llm": lambda: summarize_with_local_llm(input_data, custom_prompt_arg, temp, system_message),
        }

        handler = dispatch.get(backend)
        if handler is None:
            return f"Error: Invalid API Name {api_name}"
        return handler()

    except Exception as e:
        logging.error(f"Error in summarize function: {str(e)}", exc_info=True)
        return f"Error: {str(e)}"
94
+
95
+
96
def extract_text_from_segments(segments):
    """Join the ``'Text'`` values of transcript segments into one string.

    Args:
        segments: A list of segment dicts. Entries that are not dicts, or
            whose ``'Text'`` value is missing or not a string, are skipped
            with a warning.

    Returns:
        The segment texts separated by single spaces and stripped of
        leading/trailing whitespace. An empty string is returned when
        *segments* is not a list or holds no usable text.
    """
    logging.debug(f"Segments received: {segments}")
    logging.debug(f"Type of segments: {type(segments)}")

    if not isinstance(segments, list):
        logging.warning(f"Unexpected type of 'segments': {type(segments)}")
        return ""

    pieces = []
    for segment in segments:
        logging.debug(f"Current segment: {segment}")
        logging.debug(f"Type of segment: {type(segment)}")
        # Require a real dict: `'Text' in segment` would do a substring test
        # on a str and raise on None or numbers.
        if isinstance(segment, dict) and isinstance(segment.get('Text'), str):
            pieces.append(segment['Text'])
        else:
            logging.warning(f"Skipping segment due to missing 'Text' key: {segment}")

    return " ".join(pieces).strip()
114
+
115
+
116
def summarize_with_openai(api_key, input_data, custom_prompt_arg, temp=None, system_message=None):
    """Summarize text with the OpenAI chat completions API.

    Args:
        api_key: OpenAI API key; when blank, the ``openai`` entry of the
            configured API keys is used.
        input_data: A JSON string (starting with ``{``), a path to a JSON
            file, raw text, a list of segment dicts, or a dict.
        custom_prompt_arg: Instruction text appended after the content.
        temp: Sampling temperature; defaults to 0.7.
        system_message: System prompt; a generic default is used when omitted.

    Returns:
        The summary text, the existing ``summary`` value when the input
        already contains one, or a string starting with ``"OpenAI:"`` that
        describes the failure.
    """
    loaded_config_data = load_and_log_configs()
    try:
        # API key validation: the parameter wins, the config is the fallback.
        if not api_key or api_key.strip() == "":
            logging.info("OpenAI: #1 API key not provided as parameter")
            logging.info("OpenAI: Attempting to use API key from config file")
            api_key = loaded_config_data['api_keys']['openai'] if loaded_config_data is not None else None

        if not api_key or api_key.strip() == "":
            logging.error("OpenAI: #2 API key not found or is empty")
            return "OpenAI: API Key Not Provided/Found in Config file or is empty"

        openai_api_key = api_key
        # Log the key once, and never in full: slicing a short key with
        # [:5] and [-5:] would reveal every character of it.
        if len(openai_api_key) > 10:
            masked_key = f"{openai_api_key[:5]}...{openai_api_key[-5:]}"
        else:
            masked_key = "<redacted>"
        logging.debug(f"OpenAI: Using API Key: {masked_key}")

        # Input data handling
        logging.debug(f"OpenAI: Raw input data type: {type(input_data)}")
        logging.debug(f"OpenAI: Raw input data (first 500 chars): {str(input_data)[:500]}...")

        if isinstance(input_data, str):
            if input_data.strip().startswith('{'):
                # It's likely a JSON string
                logging.debug("OpenAI: Parsing provided JSON string data for summarization")
                try:
                    data = json.loads(input_data)
                except json.JSONDecodeError as e:
                    logging.error(f"OpenAI: Error parsing JSON string: {str(e)}")
                    return f"OpenAI: Error parsing JSON input: {str(e)}"
            elif os.path.isfile(input_data):
                logging.debug("OpenAI: Loading JSON data from file for summarization")
                with open(input_data, 'r', encoding='utf-8') as file:
                    data = json.load(file)
            else:
                logging.debug("OpenAI: Using provided string data for summarization")
                data = input_data
        else:
            data = input_data

        logging.debug(f"OpenAI: Processed data type: {type(data)}")
        logging.debug(f"OpenAI: Processed data (first 500 chars): {str(data)[:500]}...")

        # Text extraction
        if isinstance(data, dict):
            if 'summary' in data:
                logging.debug("OpenAI: Summary already exists in the loaded data")
                return data['summary']
            elif 'segments' in data:
                text = extract_text_from_segments(data['segments'])
            else:
                text = json.dumps(data)  # Convert dict to string if no specific format
        elif isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError(f"OpenAI: Invalid input data format: {type(data)}")

        logging.debug(f"OpenAI: Extracted text (first 500 chars): {text[:500]}...")
        logging.debug(f"OpenAI: Custom prompt: {custom_prompt_arg}")

        # Without a loaded config, an explicitly passed key still works with
        # the default model.
        configured_model = loaded_config_data['models']['openai'] if loaded_config_data is not None else None
        openai_model = configured_model or "gpt-4o"
        logging.debug(f"OpenAI: Using model: {openai_model}")

        headers = {
            'Authorization': f'Bearer {openai_api_key}',
            'Content-Type': 'application/json'
        }

        logging.debug("openai: Preparing data + prompt for submittal")
        openai_prompt = f"{text} \n\n\n\n{custom_prompt_arg}"
        if temp is None:
            temp = 0.7
        if system_message is None:
            system_message = "You are a helpful AI assistant who does whatever the user requests."
        temp = float(temp)
        payload = {
            "model": openai_model,
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": openai_prompt}
            ],
            "max_tokens": 4096,
            "temperature": temp
        }

        logging.debug("OpenAI: Posting request")
        response = requests.post('https://api.openai.com/v1/chat/completions', headers=headers, json=payload)

        if response.status_code == 200:
            response_data = response.json()
            if 'choices' in response_data and len(response_data['choices']) > 0:
                summary = response_data['choices'][0]['message']['content'].strip()
                logging.debug("OpenAI: Summarization successful")
                logging.debug(f"OpenAI: Summary (first 500 chars): {summary[:500]}...")
                return summary
            else:
                logging.warning("OpenAI: Summary not found in the response data")
                return "OpenAI: Summary not available"
        else:
            logging.error(f"OpenAI: Summarization failed with status code {response.status_code}")
            logging.error(f"OpenAI: Error response: {response.text}")
            return f"OpenAI: Failed to process summary. Status code: {response.status_code}"
    except json.JSONDecodeError as e:
        logging.error(f"OpenAI: Error decoding JSON: {str(e)}", exc_info=True)
        return f"OpenAI: Error decoding JSON input: {str(e)}"
    except requests.RequestException as e:
        logging.error(f"OpenAI: Error making API request: {str(e)}", exc_info=True)
        return f"OpenAI: Error making API request: {str(e)}"
    except Exception as e:
        logging.error(f"OpenAI: Unexpected error: {str(e)}", exc_info=True)
        return f"OpenAI: Unexpected error occurred: {str(e)}"
230
+
231
+
232
def summarize_with_anthropic(api_key, input_data, custom_prompt_arg, temp=None, system_message=None, max_retries=3, retry_delay=5):
    """Summarize text with the Anthropic Messages API.

    Args:
        api_key: Anthropic API key; when blank, the ``anthropic`` entry of the
            configured API keys is used.
        input_data: Path to a JSON file, a list of segment dicts, or raw text.
        custom_prompt_arg: Instruction text appended after the content.
        temp: Sampling temperature; defaults to 0.1.
        system_message: System prompt; a generic default is used when omitted.
        max_retries: Number of request attempts. HTTP 500 replies and request
            exceptions are retried.
        retry_delay: Seconds to wait between attempts.

    Returns:
        The summary text, or the existing ``summary`` value when the loaded
        data already contains one. Configuration, input and network problems
        return a string starting with ``"anthropic:"``. ``None`` is returned
        when the API answers with a non-retryable status, an unexpected
        payload, or only HTTP 500 replies.
    """
    logging.debug("Anthropic: Summarization process starting...")
    try:
        logging.debug("Anthropic: Loading and validating configurations")
        loaded_config_data = load_and_log_configs()
        if loaded_config_data is None:
            # The model name lives in the config, so nothing can be sent.
            logging.error("Failed to load configuration data")
            return "anthropic: Failed to load configuration data"

        # Prioritize the API key passed as a parameter
        if api_key and api_key.strip():
            anthropic_api_key = api_key
            logging.info("Anthropic: Using API key provided as parameter")
        else:
            # If no parameter is provided, use the key from the config
            anthropic_api_key = loaded_config_data['api_keys'].get('anthropic')
            if anthropic_api_key:
                logging.info("Anthropic: Using API key from config file")
            else:
                logging.warning("Anthropic: No API key found in config file")

        # Stop early instead of failing later while slicing a missing key.
        if not anthropic_api_key or not anthropic_api_key.strip():
            logging.error("Anthropic: No valid API key available")
            return "anthropic: API Key Not Provided/Found in Config file or is empty"

        # Never log a short key in full.
        if len(anthropic_api_key) > 10:
            masked_key = f"{anthropic_api_key[:5]}...{anthropic_api_key[-5:]}"
        else:
            masked_key = "<redacted>"
        logging.debug(f"Anthropic: Using API Key: {masked_key}")

        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("AnthropicAI: Loading json data for summarization")
            with open(input_data, 'r', encoding='utf-8') as file:
                data = json.load(file)
        else:
            logging.debug("AnthropicAI: Using provided string data for summarization")
            data = input_data

        # str() first: dict data cannot be sliced directly.
        logging.debug(f"AnthropicAI: Loaded data: {str(data)[:500]}...(snipped to first 500 chars)")
        logging.debug(f"AnthropicAI: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # The loaded data already carries a summary; reuse it.
            logging.debug("Anthropic: Summary already exists in the loaded data")
            return data['summary']

        # A list of segment dictionaries or a plain string can be summarized.
        if isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("Anthropic: Invalid input data format")

        if temp is None:
            temp = 0.1
        temp = float(temp)

        if system_message is None:
            system_message = "You are a helpful AI assistant who does whatever the user requests."

        headers = {
            'x-api-key': anthropic_api_key,
            'anthropic-version': '2023-06-01',
            'Content-Type': 'application/json'
        }

        anthropic_prompt = custom_prompt_arg
        logging.debug(f"Anthropic: Prompt is {anthropic_prompt}")
        user_message = {
            "role": "user",
            "content": f"{text} \n\n\n\n{anthropic_prompt}"
        }

        model = loaded_config_data['models']['anthropic']

        payload = {
            "model": model,
            "max_tokens": 4096,  # max _possible_ tokens to return
            "messages": [user_message],
            "stop_sequences": ["\n\nHuman:"],
            "temperature": temp,
            "top_k": 0,
            "top_p": 1.0,
            "metadata": {
                "user_id": "example_user_id",
            },
            "stream": False,
            "system": system_message
        }

        for attempt in range(max_retries):
            attempts_remain = attempt < max_retries - 1
            try:
                logging.debug("anthropic: Posting request to API")
                response = requests.post('https://api.anthropic.com/v1/messages', headers=headers, json=payload)

                # Check if the status code indicates success
                if response.status_code == 200:
                    logging.debug("anthropic: Post submittal successful")
                    response_data = response.json()
                    try:
                        summary = response_data['content'][0]['text'].strip()
                        logging.debug("anthropic: Summarization successful")
                        print("Summary processed successfully.")
                        return summary
                    except (IndexError, KeyError):
                        logging.debug("anthropic: Unexpected data in response")
                        print("Unexpected response format from Anthropic API:", response.text)
                        return None
                elif response.status_code == 500:  # Handle internal server error specifically
                    logging.debug("anthropic: Internal server error")
                    print("Internal server error from API. Retrying may be necessary.")
                    # Waiting is pointless when no further attempt follows.
                    if attempts_remain:
                        time.sleep(retry_delay)
                else:
                    logging.debug(
                        f"anthropic: Failed to summarize, status code {response.status_code}: {response.text}")
                    print(f"Failed to process summary, status code {response.status_code}: {response.text}")
                    return None

            except RequestException as e:
                logging.error(f"anthropic: Network error during attempt {attempt + 1}/{max_retries}: {str(e)}")
                if attempts_remain:
                    time.sleep(retry_delay)
                else:
                    return f"anthropic: Network error: {str(e)}"

        # Reached only when every attempt ended in HTTP 500 (or max_retries < 1).
        logging.error(f"anthropic: No successful response after {max_retries} attempt(s)")
        return None
    except FileNotFoundError:
        logging.error(f"anthropic: File not found: {input_data}")
        return f"anthropic: File not found: {input_data}"
    except json.JSONDecodeError:
        logging.error(f"anthropic: Invalid JSON format in file: {input_data}")
        return f"anthropic: Invalid JSON format in file: {input_data}"
    except Exception as e:
        logging.error(f"anthropic: Error in processing: {str(e)}")
        return f"anthropic: Error occurred while processing summary with Anthropic: {str(e)}"
369
+
370
+
371
+ # Summarize with Cohere
372
def summarize_with_cohere(api_key, input_data, custom_prompt_arg, temp=None, system_message=None):
    """Summarize text with the Cohere chat API.

    Args:
        api_key: Cohere API key; when blank, the ``cohere`` entry of the
            configured API keys is used.
        input_data: Path to a JSON file, a list of segment dicts, or raw text.
        custom_prompt_arg: Instruction text appended after the content;
            ``None`` is treated as an empty string.
        temp: Sampling temperature; defaults to 0.3.
        system_message: Preamble sent to the model; ``None`` is treated as an
            empty string.

    Returns:
        The summary text, the existing ``summary`` value when the loaded data
        already contains one, or a string describing the failure.
    """
    logging.debug("Cohere: Summarization process starting...")
    try:
        logging.debug("Cohere: Loading and validating configurations")
        loaded_config_data = load_and_log_configs()
        if loaded_config_data is None:
            # The model name lives in the config, so nothing can be sent.
            logging.error("Failed to load configuration data")
            return "cohere: Failed to load configuration data"

        # Prioritize the API key passed as a parameter
        if api_key and api_key.strip():
            cohere_api_key = api_key
            logging.info("Cohere: Using API key provided as parameter")
        else:
            # If no parameter is provided, use the key from the config
            cohere_api_key = loaded_config_data['api_keys'].get('cohere')
            if cohere_api_key:
                logging.info("Cohere: Using API key from config file")
            else:
                logging.warning("Cohere: No API key found in config file")

        # Stop early instead of failing later while slicing a missing key.
        if not cohere_api_key or not cohere_api_key.strip():
            logging.error("Cohere: No valid API key available")
            return "cohere: API Key Not Provided/Found in Config file or is empty"

        if custom_prompt_arg is None:
            custom_prompt_arg = ""

        # An omitted system message has always resulted in an empty preamble;
        # the later "helpful assistant" default was unreachable and is removed.
        if system_message is None:
            system_message = ""

        # Never log a short key in full.
        if len(cohere_api_key) > 10:
            masked_key = f"{cohere_api_key[:5]}...{cohere_api_key[-5:]}"
        else:
            masked_key = "<redacted>"
        logging.debug(f"Cohere: Using API Key: {masked_key}")

        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("Cohere: Loading json data for summarization")
            with open(input_data, 'r', encoding='utf-8') as file:
                data = json.load(file)
        else:
            logging.debug("Cohere: Using provided string data for summarization")
            data = input_data

        # str() first: dict data cannot be sliced directly.
        logging.debug(f"Cohere: Loaded data: {str(data)[:500]}...(snipped to first 500 chars)")
        logging.debug(f"Cohere: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # The loaded data already carries a summary; reuse it.
            logging.debug("Cohere: Summary already exists in the loaded data")
            return data['summary']

        # A list of segment dictionaries or a plain string can be summarized.
        if isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("Invalid input data format")

        cohere_model = loaded_config_data['models']['cohere']

        if temp is None:
            temp = 0.3
        temp = float(temp)

        headers = {
            'accept': 'application/json',
            'content-type': 'application/json',
            'Authorization': f'Bearer {cohere_api_key}'
        }

        cohere_prompt = f"{text} \n\n\n\n{custom_prompt_arg}"
        logging.debug(f"cohere: Prompt being sent is {cohere_prompt}")

        payload = {
            "preamble": system_message,
            "message": cohere_prompt,
            "model": cohere_model,
            # "connectors": [{"id": "web-search"}],
            "temperature": temp
        }

        logging.debug("cohere: Submitting request to API endpoint")
        response = requests.post('https://api.cohere.ai/v1/chat', headers=headers, json=payload)

        if response.status_code != 200:
            logging.error(f"cohere: API request failed with status code {response.status_code}: {response.text}")
            print(f"Failed to process summary, status code {response.status_code}: {response.text}")
            return f"cohere: API request failed: {response.text}"

        # Parse only successful replies; error bodies are not always JSON.
        response_data = response.json()
        logging.debug("API Response Data: %s", response_data)

        if 'text' in response_data:
            summary = response_data['text'].strip()
            logging.debug("cohere: Summarization successful")
            print("Summary processed successfully.")
            return summary

        logging.error("Expected data not found in API response.")
        return "Expected data not found in API response."

    except Exception as e:
        logging.error("cohere: Error in processing: %s", str(e))
        return f"cohere: Error occurred while processing summary with Cohere: {str(e)}"
481
+
482
+
483
+ # https://console.groq.com/docs/quickstart
484
def summarize_with_groq(api_key, input_data, custom_prompt_arg, temp=None, system_message=None):
    """Summarize text or transcript segments through Groq's chat-completions endpoint.

    Args:
        api_key: API key to use. When empty or blank, the ``groq`` entry of the
            ``api_keys`` config section is used instead.
        input_data: Path to a JSON file, a list of segments, or a plain string.
        custom_prompt_arg: Instruction text appended after the content.
        temp: Sampling temperature; defaults to 0.2 and is coerced to ``float``.
        system_message: System prompt; a generic assistant prompt is used when None.

    Returns:
        The stripped summary text on success. If the loaded JSON is a dict that
        already has a ``summary`` key, that value is returned without calling the
        API. Every failure is reported as an error string; nothing is raised.
    """
    logging.debug("Groq: Summarization process starting...")
    try:
        logging.debug("Groq: Loading and validating configurations")
        loaded_config_data = load_and_log_configs()
        if loaded_config_data is None:
            logging.error("Failed to load configuration data")
            raise ValueError("Failed to load configuration data")

        # Prioritize the API key passed as a parameter
        if api_key and api_key.strip():
            groq_api_key = api_key
            logging.info("Groq: Using API key provided as parameter")
        else:
            # If no parameter is provided, use the key from the config
            groq_api_key = loaded_config_data['api_keys'].get('groq')
            if groq_api_key:
                logging.info("Groq: Using API key from config file")
            else:
                logging.warning("Groq: No API key found in config file")

        # Fail explicitly instead of slicing None further down.
        if not groq_api_key or not groq_api_key.strip():
            logging.error("Groq: No valid API key available")
            raise ValueError("No valid Groq API key available")

        logging.debug(f"Groq: Using API Key: {groq_api_key[:5]}...{groq_api_key[-5:]}")

        # Transcript data handling & Validation
        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("Groq: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("Groq: Using provided string data for summarization")
            data = input_data

        # str() first: slicing a dict raises, which made the dict branch below unreachable.
        logging.debug(f"Groq: Loaded data: {str(data)[:500]}...(snipped to first 500 chars)")
        logging.debug(f"Groq: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # If the loaded data is a dictionary and already contains a summary, return it
            logging.debug("Groq: Summary already exists in the loaded data")
            return data['summary']

        # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
        if isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("Groq: Invalid input data format")

        # Set the model to be used
        groq_model = loaded_config_data['models']['groq']

        if temp is None:
            temp = 0.2
        temp = float(temp)
        if system_message is None:
            system_message = "You are a helpful AI assistant who does whatever the user requests."

        headers = {
            'Authorization': f'Bearer {groq_api_key}',
            'Content-Type': 'application/json'
        }

        groq_prompt = f"{text} \n\n\n\n{custom_prompt_arg}"
        logging.debug(f"groq: Prompt being sent is {groq_prompt}")

        payload = {
            "messages": [
                {
                    "role": "system",
                    "content": system_message,
                },
                {
                    "role": "user",
                    "content": groq_prompt,
                }
            ],
            "model": groq_model,
            "temperature": temp
        }

        logging.debug("groq: Submitting request to API endpoint")
        print("groq: Submitting request to API endpoint")
        response = requests.post('https://api.groq.com/openai/v1/chat/completions', headers=headers, json=payload)

        response_data = response.json()
        logging.debug("API Response Data: %s", response_data)

        if response.status_code == 200:
            if 'choices' in response_data and len(response_data['choices']) > 0:
                summary = response_data['choices'][0]['message']['content'].strip()
                logging.debug("groq: Summarization successful")
                print("Summarization successful.")
                return summary
            else:
                logging.error("Expected data not found in API response.")
                return "Expected data not found in API response."
        else:
            logging.error(f"groq: API request failed with status code {response.status_code}: {response.text}")
            return f"groq: API request failed: {response.text}"

    except Exception as e:
        logging.error("groq: Error in processing: %s", str(e))
        return f"groq: Error occurred while processing summary with groq: {str(e)}"
596
+
597
+
598
def summarize_with_openrouter(api_key, input_data, custom_prompt_arg, temp=None, system_message=None):
    """Summarize text or transcript segments through OpenRouter's chat-completions endpoint.

    Args:
        api_key: API key to use. When empty or blank, the ``openrouter`` entry of
            the ``api_keys`` config section is used instead.
        input_data: Path to a JSON file, a list of segments, or a plain string.
        custom_prompt_arg: Instruction text appended after the content.
        temp: Sampling temperature; defaults to 0.1 and is coerced to ``float``.
        system_message: System prompt; a generic assistant prompt is used when None.

    Returns:
        The stripped summary text on success, the pre-existing ``summary`` value
        when the loaded JSON is a dict containing one, or an error string when
        configuration loading or the API request fails.

    Raises:
        ValueError: If the resolved input is neither a list nor a string.
            Input loading happens outside the error-to-string handlers, so file
            and JSON errors from reading ``input_data`` also propagate.
    """
    import requests
    import json
    global openrouter_model, openrouter_api_key
    try:
        logging.debug("OpenRouter: Loading and validating configurations")
        loaded_config_data = load_and_log_configs()
        if loaded_config_data is None:
            logging.error("Failed to load configuration data")
            openrouter_api_key = None
            raise ValueError("Failed to load configuration data")

        # Prioritize the API key passed as a parameter
        if api_key and api_key.strip():
            openrouter_api_key = api_key
            logging.info("OpenRouter: Using API key provided as parameter")
        else:
            # If no parameter is provided, use the key from the config
            openrouter_api_key = loaded_config_data['api_keys'].get('openrouter')
            if openrouter_api_key:
                logging.info("OpenRouter: Using API key from config file")
            else:
                logging.warning("OpenRouter: No API key found in config file")

        # Model Selection validation (reuses the config loaded above)
        logging.debug("OpenRouter: Validating model selection")
        openrouter_model = loaded_config_data['models']['openrouter']
        logging.debug(f"OpenRouter: Using model from config file: {openrouter_model}")

        # Final check to ensure we have a valid API key
        if not openrouter_api_key or not openrouter_api_key.strip():
            logging.error("OpenRouter: No valid API key available")
            raise ValueError("No valid OpenRouter API key available")
    except Exception as e:
        logging.error("OpenRouter: Error in processing: %s", str(e))
        return f"OpenRouter: Error occurred while processing config file with OpenRouter: {str(e)}"

    logging.debug(f"OpenRouter: Using API Key: {openrouter_api_key[:5]}...{openrouter_api_key[-5:]}")
    logging.debug(f"OpenRouter: Using Model: {openrouter_model}")

    if isinstance(input_data, str) and os.path.isfile(input_data):
        logging.debug("OpenRouter: Loading json data for summarization")
        with open(input_data, 'r') as file:
            data = json.load(file)
    else:
        logging.debug("OpenRouter: Using provided string data for summarization")
        data = input_data

    # str() first: slicing a dict raises, which made the dict branch below unreachable.
    logging.debug(f"OpenRouter: Loaded data: {str(data)[:500]}...(snipped to first 500 chars)")
    logging.debug(f"OpenRouter: Type of data: {type(data)}")

    if isinstance(data, dict) and 'summary' in data:
        # If the loaded data is a dictionary and already contains a summary, return it
        logging.debug("OpenRouter: Summary already exists in the loaded data")
        return data['summary']

    # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
    if isinstance(data, list):
        text = extract_text_from_segments(data)
    elif isinstance(data, str):
        text = data
    else:
        raise ValueError("OpenRouter: Invalid input data format")

    # Build the prompt from the extracted text; the raw input may be a file path.
    openrouter_prompt = f"{text} \n\n\n\n{custom_prompt_arg}"

    if temp is None:
        temp = 0.1
    temp = float(temp)
    if system_message is None:
        system_message = "You are a helpful AI assistant who does whatever the user requests."

    try:
        logging.debug("OpenRouter: Submitting request to API endpoint")
        print("OpenRouter: Submitting request to API endpoint")
        response = requests.post(
            url="https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {openrouter_api_key}",
            },
            data=json.dumps({
                "model": openrouter_model,
                "messages": [
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": openrouter_prompt}
                ],
                "temperature": temp
            })
        )

        response_data = response.json()
        logging.debug("API Response Data: %s", response_data)

        if response.status_code == 200:
            if 'choices' in response_data and len(response_data['choices']) > 0:
                summary = response_data['choices'][0]['message']['content'].strip()
                logging.debug("openrouter: Summarization successful")
                print("openrouter: Summarization successful.")
                return summary
            else:
                logging.error("openrouter: Expected data not found in API response.")
                return "openrouter: Expected data not found in API response."
        else:
            logging.error(f"openrouter: API request failed with status code {response.status_code}: {response.text}")
            return f"openrouter: API request failed: {response.text}"
    except Exception as e:
        logging.error("openrouter: Error in processing: %s", str(e))
        return f"openrouter: Error occurred while processing summary with openrouter: {str(e)}"
709
+
710
+
711
def summarize_with_huggingface(api_key, input_data, custom_prompt_arg, temp=None):
    """Summarize text or transcript segments through the Hugging Face Inference API.

    Args:
        api_key: API key to use. When empty or blank, the ``huggingface`` entry of
            the ``api_keys`` config section is used instead.
        input_data: Path to a JSON file, a list of segments, or a plain string.
        custom_prompt_arg: Instruction text appended after the content.
        temp: Coerced to ``float`` (default 0.1) for validation only; it is not
            included in the request payload.

    Returns:
        The stripped ``generated_text`` of the first result on HTTP 200, the
        pre-existing ``summary`` value when the loaded JSON is a dict containing
        one, an error string on a non-200 response, or ``None`` when an
        exception occurs.
    """
    global huggingface_api_key
    logging.debug("HuggingFace: Summarization process starting...")
    try:
        logging.debug("HuggingFace: Loading and validating configurations")
        loaded_config_data = load_and_log_configs()
        if loaded_config_data is None:
            logging.error("Failed to load configuration data")
            huggingface_api_key = None
            raise ValueError("Failed to load configuration data")

        # Prioritize the API key passed as a parameter
        if api_key and api_key.strip():
            huggingface_api_key = api_key
            logging.info("HuggingFace: Using API key provided as parameter")
        else:
            # If no parameter is provided, use the key from the config
            huggingface_api_key = loaded_config_data['api_keys'].get('huggingface')
            if huggingface_api_key:
                logging.info("HuggingFace: Using API key from config file")
            else:
                logging.warning("HuggingFace: No API key found in config file")

        # Fail explicitly instead of slicing None further down.
        if not huggingface_api_key or not huggingface_api_key.strip():
            logging.error("HuggingFace: No valid API key available")
            raise ValueError("No valid HuggingFace API key available")

        logging.debug(f"HuggingFace: Using API Key: {huggingface_api_key[:5]}...{huggingface_api_key[-5:]}")

        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("HuggingFace: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("HuggingFace: Using provided string data for summarization")
            data = input_data

        # str() first: slicing a dict raises, which made the dict branch below unreachable.
        logging.debug(f"HuggingFace: Loaded data: {str(data)[:500]}...(snipped to first 500 chars)")
        logging.debug(f"HuggingFace: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # If the loaded data is a dictionary and already contains a summary, return it
            logging.debug("HuggingFace: Summary already exists in the loaded data")
            return data['summary']

        # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
        if isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("HuggingFace: Invalid input data format")

        headers = {
            "Authorization": f"Bearer {huggingface_api_key}"
        }
        huggingface_model = loaded_config_data['models']['huggingface']
        API_URL = f"https://api-inference.huggingface.co/models/{huggingface_model}"
        if temp is None:
            temp = 0.1
        temp = float(temp)
        huggingface_prompt = f"{text}\n\n\n\n{custom_prompt_arg}"
        logging.debug(f"huggingface: Prompt being sent is {huggingface_prompt}")
        payload = {
            # Send the assembled prompt; sending only `text` dropped the custom prompt.
            "inputs": huggingface_prompt,
            "parameters": {"max_length": 512, "min_length": 100}  # You can adjust max_length and min_length as needed
        }

        logging.debug("huggingface: Submitting request...")
        response = requests.post(API_URL, headers=headers, json=payload)

        if response.status_code == 200:
            summary = response.json()[0]['generated_text'].strip()
            logging.debug("huggingface: Summarization successful")
            print("Summarization successful.")
            return summary
        else:
            logging.error(f"huggingface: Summarization failed with status code {response.status_code}: {response.text}")
            return f"Failed to process summary, status code {response.status_code}: {response.text}"

    except Exception as e:
        logging.error("huggingface: Error in processing: %s", str(e))
        print(f"Error occurred while processing summary with huggingface: {str(e)}")
        return None
801
+
802
+
803
def summarize_with_deepseek(api_key, input_data, custom_prompt_arg, temp=None, system_message=None):
    """Summarize text or transcript segments through DeepSeek's chat-completions endpoint.

    Args:
        api_key: API key to use. When empty or blank, the ``deepseek`` entry of
            the ``api_keys`` config section is used instead.
        input_data: Path to a JSON file, a list of segments, or a plain string.
        custom_prompt_arg: Instruction text appended after the content.
        temp: Sampling temperature; defaults to 0.1 and is coerced to ``float``.
        system_message: System prompt; a generic assistant prompt is used when None.

    Returns:
        The stripped summary text on success. If the loaded JSON is a dict that
        already has a ``summary`` key, that value is returned without calling the
        API. Every failure is reported as an error string; nothing is raised.
    """
    logging.debug("DeepSeek: Summarization process starting...")
    try:
        logging.debug("DeepSeek: Loading and validating configurations")
        loaded_config_data = load_and_log_configs()
        if loaded_config_data is None:
            logging.error("Failed to load configuration data")
            raise ValueError("Failed to load configuration data")

        # Prioritize the API key passed as a parameter
        if api_key and api_key.strip():
            deepseek_api_key = api_key
            logging.info("DeepSeek: Using API key provided as parameter")
        else:
            # If no parameter is provided, use the key from the config
            deepseek_api_key = loaded_config_data['api_keys'].get('deepseek')
            if deepseek_api_key:
                logging.info("DeepSeek: Using API key from config file")
            else:
                logging.warning("DeepSeek: No API key found in config file")

        # Fail explicitly instead of slicing None further down.
        if not deepseek_api_key or not deepseek_api_key.strip():
            logging.error("DeepSeek: No valid API key available")
            raise ValueError("No valid DeepSeek API key available")

        logging.debug(f"DeepSeek: Using API Key: {deepseek_api_key[:5]}...{deepseek_api_key[-5:]}")

        # Input data handling
        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("DeepSeek: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("DeepSeek: Using provided string data for summarization")
            data = input_data

        # str() first: slicing a dict raises, which made the dict branch below unreachable.
        logging.debug(f"DeepSeek: Loaded data: {str(data)[:500]}...(snipped to first 500 chars)")
        logging.debug(f"DeepSeek: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # If the loaded data is a dictionary and already contains a summary, return it
            logging.debug("DeepSeek: Summary already exists in the loaded data")
            return data['summary']

        # Text extraction
        if isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("DeepSeek: Invalid input data format")

        deepseek_model = loaded_config_data['models']['deepseek'] or "deepseek-chat"

        if temp is None:
            temp = 0.1
        temp = float(temp)
        if system_message is None:
            system_message = "You are a helpful AI assistant who does whatever the user requests."

        headers = {
            # Use the resolved key: the raw parameter is empty when the key comes from config.
            'Authorization': f'Bearer {deepseek_api_key}',
            'Content-Type': 'application/json'
        }

        logging.debug("DeepSeek: Preparing data + prompt for submittal")
        deepseek_prompt = f"{text} \n\n\n\n{custom_prompt_arg}"
        payload = {
            "model": deepseek_model,
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": deepseek_prompt}
            ],
            "stream": False,
            "temperature": temp
        }

        logging.debug("DeepSeek: Posting request")
        response = requests.post('https://api.deepseek.com/chat/completions', headers=headers, json=payload)

        if response.status_code == 200:
            response_data = response.json()
            if 'choices' in response_data and len(response_data['choices']) > 0:
                summary = response_data['choices'][0]['message']['content'].strip()
                logging.debug("DeepSeek: Summarization successful")
                return summary
            else:
                logging.warning("DeepSeek: Summary not found in the response data")
                return "DeepSeek: Summary not available"
        else:
            logging.error(f"DeepSeek: Summarization failed with status code {response.status_code}")
            logging.error(f"DeepSeek: Error response: {response.text}")
            return f"DeepSeek: Failed to process summary. Status code: {response.status_code}"
    except Exception as e:
        logging.error(f"DeepSeek: Error in processing: {str(e)}", exc_info=True)
        return f"DeepSeek: Error occurred while processing summary: {str(e)}"
907
+
908
+
909
def summarize_with_mistral(api_key, input_data, custom_prompt_arg, temp=None, system_message=None):
    """Summarize text or transcript segments through Mistral's chat-completions endpoint.

    Args:
        api_key: API key to use. When empty or blank, the ``mistral`` entry of
            the ``api_keys`` config section is used instead.
        input_data: Path to a JSON file, a list of segments, or a plain string.
        custom_prompt_arg: Instruction text placed before the content.
        temp: Sampling temperature; defaults to 0.2 and is coerced to ``float``.
        system_message: System prompt; a generic assistant prompt is used when None.

    Returns:
        The stripped summary text on success. If the loaded JSON is a dict that
        already has a ``summary`` key, that value is returned without calling the
        API. Every failure is reported as an error string; nothing is raised.
    """
    logging.debug("Mistral: Summarization process starting...")
    try:
        logging.debug("Mistral: Loading and validating configurations")
        loaded_config_data = load_and_log_configs()
        if loaded_config_data is None:
            logging.error("Failed to load configuration data")
            raise ValueError("Failed to load configuration data")

        # Prioritize the API key passed as a parameter
        if api_key and api_key.strip():
            mistral_api_key = api_key
            logging.info("Mistral: Using API key provided as parameter")
        else:
            # If no parameter is provided, use the key from the config
            mistral_api_key = loaded_config_data['api_keys'].get('mistral')
            if mistral_api_key:
                logging.info("Mistral: Using API key from config file")
            else:
                logging.warning("Mistral: No API key found in config file")

        # Fail explicitly instead of slicing None further down.
        if not mistral_api_key or not mistral_api_key.strip():
            logging.error("Mistral: No valid API key available")
            raise ValueError("No valid Mistral API key available")

        logging.debug(f"Mistral: Using API Key: {mistral_api_key[:5]}...{mistral_api_key[-5:]}")

        # Input data handling
        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("Mistral: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("Mistral: Using provided string data for summarization")
            data = input_data

        # str() first: slicing a dict raises, which made the dict branch below unreachable.
        logging.debug(f"Mistral: Loaded data: {str(data)[:500]}...(snipped to first 500 chars)")
        logging.debug(f"Mistral: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # If the loaded data is a dictionary and already contains a summary, return it
            logging.debug("Mistral: Summary already exists in the loaded data")
            return data['summary']

        # Text extraction
        if isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("Mistral: Invalid input data format")

        mistral_model = loaded_config_data['models']['mistral'] or "mistral-large-latest"

        if temp is None:
            temp = 0.2
        temp = float(temp)
        if system_message is None:
            system_message = "You are a helpful AI assistant who does whatever the user requests."

        headers = {
            'Authorization': f'Bearer {mistral_api_key}',
            'Content-Type': 'application/json'
        }

        logging.debug("Mistral: Preparing data + prompt for submittal")
        mistral_prompt = f"{custom_prompt_arg}\n\n\n\n{text} "
        payload = {
            "model": mistral_model,
            "messages": [
                {"role": "system",
                 "content": system_message},
                {"role": "user",
                 "content": mistral_prompt}
            ],
            "temperature": temp,
            "top_p": 1,
            "max_tokens": 4096,
            # Real JSON booleans, not the string "false".
            "stream": False,
            "safe_prompt": False
        }

        logging.debug("Mistral: Posting request")
        response = requests.post('https://api.mistral.ai/v1/chat/completions', headers=headers, json=payload)

        if response.status_code == 200:
            response_data = response.json()
            if 'choices' in response_data and len(response_data['choices']) > 0:
                summary = response_data['choices'][0]['message']['content'].strip()
                logging.debug("Mistral: Summarization successful")
                return summary
            else:
                logging.warning("Mistral: Summary not found in the response data")
                return "Mistral: Summary not available"
        else:
            logging.error(f"Mistral: Summarization failed with status code {response.status_code}")
            logging.error(f"Mistral: Error response: {response.text}")
            return f"Mistral: Failed to process summary. Status code: {response.status_code}"
    except Exception as e:
        logging.error(f"Mistral: Error in processing: {str(e)}", exc_info=True)
        return f"Mistral: Error occurred while processing summary: {str(e)}"
1018
+
1019
+ #
1020
+ #
1021
+ #######################################################################################################################
1022
+ #
1023
+ #
1024
+ # Gradio File Processing
1025
+
1026
+
1027
+ # Handle multiple videos as input
1028
def process_video_urls(url_list, num_speakers, whisper_model, custom_prompt_input, offset, api_name, api_key, vad_filter,
                       download_video_flag, download_audio, rolling_summarization, detail_level, question_box,
                       keywords, chunk_text_by_words, max_words, chunk_text_by_sentences, max_sentences,
                       chunk_text_by_paragraphs, max_paragraphs, chunk_text_by_tokens, max_tokens, chunk_by_semantic,
                       semantic_chunk_size, semantic_chunk_overlap, recursive_summarization):
    """Process a batch of video URLs one at a time, yielding progress after each.

    This is a generator. Each URL is passed to ``process_url`` with the given
    options; a failure on one URL is logged and recorded in the status text
    without stopping the batch.

    Yields:
        6-tuples ``(progress_text, status_text, None, None, None, None)``: one
        after every URL, then a final one whose status is the overall outcome.
        A plain ``return`` value from a generator is not delivered to code that
        iterates it, so the final outcome is yielded as well as returned.

    Returns:
        The same final 6-tuple, for callers that drive the generator manually
        or via ``yield from``.
    """
    global current_progress
    progress = []  # This must always be a list
    status = []  # This must always be a list
    failure_count = 0

    if url_list is None:
        url_list = []

    # Defined up front so an empty batch cannot hit an unbound name below.
    current_progress = ""
    current_status = ""

    if custom_prompt_input is None:
        custom_prompt_input = """
You are a bulleted notes specialist. ```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.
**Bulleted Note Creation Guidelines**

**Headings**:
- Based on referenced topics, not categories like quotes or terms
- Surrounded by **bold** formatting
- Not listed as bullet points
- No space between headings and list items underneath

**Emphasis**:
- **Important terms** set in bold font
- **Text ending in a colon**: also bolded

**Review**:
- Ensure adherence to specified format
- Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]"""

    def update_progress(index, url, message):
        progress.append(f"Processing {index + 1}/{len(url_list)}: {url}")  # Append to list
        status.append(message)  # Append to list
        return "\n".join(progress), "\n".join(status)  # Return strings for display

    for index, url in enumerate(url_list):
        try:
            logging.info(f"Starting to process video {index + 1}/{len(url_list)}: {url}")
            # The returned values are not used here; the unpacking is kept so an
            # unexpected return shape still lands in the error path below.
            transcription, summary, json_file_path, summary_file_path, _, _ = process_url(
                url=url,
                num_speakers=num_speakers,
                whisper_model=whisper_model,
                custom_prompt_input=custom_prompt_input,
                offset=offset,
                api_name=api_name,
                api_key=api_key,
                vad_filter=vad_filter,
                download_video_flag=download_video_flag,
                download_audio=download_audio,
                rolling_summarization=rolling_summarization,
                detail_level=detail_level,
                question_box=question_box,
                keywords=keywords,
                chunk_text_by_words=chunk_text_by_words,
                max_words=max_words,
                chunk_text_by_sentences=chunk_text_by_sentences,
                max_sentences=max_sentences,
                chunk_text_by_paragraphs=chunk_text_by_paragraphs,
                max_paragraphs=max_paragraphs,
                chunk_text_by_tokens=chunk_text_by_tokens,
                max_tokens=max_tokens,
                chunk_by_semantic=chunk_by_semantic,
                semantic_chunk_size=semantic_chunk_size,
                semantic_chunk_overlap=semantic_chunk_overlap,
                recursive_summarization=recursive_summarization,
            )

            current_progress, current_status = update_progress(index, url, "Video processed and ingested into the database.")
            logging.info(f"Successfully processed video {index + 1}/{len(url_list)}: {url}")

            time.sleep(1)
        except Exception as e:
            failure_count += 1
            logging.error(f"Error processing video {index + 1}/{len(url_list)}: {url}")
            logging.error(f"Error details: {str(e)}")
            current_progress, current_status = update_progress(index, url, f"Error: {str(e)}")

        yield current_progress, current_status, None, None, None, None

    if failure_count:
        # Do not claim success when some videos failed.
        final_message = (f"Finished processing {len(url_list)} video(s); {failure_count} failed. "
                         f"See the status log for details.")
    else:
        final_message = "All videos have been transcribed, summarized, and ingested into the database successfully."

    final_result = (current_progress, final_message, None, None, None, None)
    yield final_result
    return final_result
1106
+
1107
+
1108
+
1109
def perform_transcription(video_path, offset, whisper_model, vad_filter, diarize=False):
    """Convert a video to WAV and return its transcription segments, reusing cached JSON.

    The segments path is published in the module global ``segments_json_path``.
    A cached ``.diarized.json`` / ``.segments.json`` file next to the WAV is
    reused when it parses and is non-empty; otherwise it is deleted and the
    segments are generated again.

    Returns:
        ``(audio_file_path, segments)``. ``(None, None)`` is returned when a
        corrupt segments cache was removed and regeneration produced no segments.
    """
    global segments_json_path
    wav_path = convert_to_wav(video_path, offset)
    segments_json_path = wav_path.replace('.wav', '.segments.json')

    if not diarize:
        # Non-diarized transcription (existing functionality)
        if not os.path.exists(segments_json_path):
            _, segments = re_generate_transcription(wav_path, whisper_model, vad_filter)
            return wav_path, segments

        logging.info(f"Segments file already exists: {segments_json_path}")
        try:
            with open(segments_json_path, 'r') as cached:
                segments = json.load(cached)
            if not segments:
                logging.warning(f"Segments JSON file is empty, re-generating: {segments_json_path}")
                raise ValueError("Empty segments JSON file")
            logging.debug(f"Loaded segments from {segments_json_path}")
        except (json.JSONDecodeError, ValueError) as err:
            logging.error(f"Failed to read or parse the segments JSON file: {err}")
            os.remove(segments_json_path)
            logging.info(f"Re-generating transcription for {wav_path}")
            _, segments = re_generate_transcription(wav_path, whisper_model, vad_filter)
            if segments is None:
                return None, None
        return wav_path, segments

    diarized_path = wav_path.replace('.wav', '.diarized.json')

    # Reuse a previously written diarized file when it is usable
    if os.path.exists(diarized_path):
        logging.info(f"Diarized file already exists: {diarized_path}")
        try:
            with open(diarized_path, 'r') as cached:
                cached_segments = json.load(cached)
            if not cached_segments:
                logging.warning(f"Diarized JSON file is empty, re-generating: {diarized_path}")
                raise ValueError("Empty diarized JSON file")
            logging.debug(f"Loaded diarized segments from {diarized_path}")
            return wav_path, cached_segments
        except (json.JSONDecodeError, ValueError) as err:
            logging.error(f"Failed to read or parse the diarized JSON file: {err}")
            os.remove(diarized_path)

    # No usable cache: build the diarized transcription and persist it
    logging.info(f"Generating diarized transcription for {wav_path}")
    fresh_segments = combine_transcription_and_diarization(wav_path)
    with open(diarized_path, 'w') as out:
        json.dump(fresh_segments, out, indent=2)
    return wav_path, fresh_segments
1163
+
1164
+
1165
def re_generate_transcription(audio_file_path, whisper_model, vad_filter):
    """Transcribe an audio file and save the segments as JSON next to it.

    The output path is derived from ``audio_file_path`` by replacing ``.wav``
    with ``.segments.json``, the same rule ``perform_transcription`` uses.
    Deriving it here avoids reading a module-level global that may be unset
    or left over from a different file.

    Returns:
        ``(audio_file_path, segments)`` on success, or ``(None, None)`` if
        transcription or saving raised an exception (the error is logged).
    """
    try:
        segments = speech_to_text(audio_file_path, whisper_model=whisper_model, vad_filter=vad_filter)
        output_path = audio_file_path.replace('.wav', '.segments.json')
        # Save segments to JSON
        with open(output_path, 'w') as file:
            json.dump(segments, file, indent=2)
        logging.debug(f"Transcription segments saved to {output_path}")
        return audio_file_path, segments
    except Exception as e:
        logging.error(f"Error in re-generating transcription: {str(e)}")
        return None, None
1176
+
1177
+
1178
def save_transcription_and_summary(transcription_text, summary_text, download_path, info_dict):
    """Write the transcription, and the summary when non-empty, as UTF-8 text files.

    File names are built from the sanitized ``title`` entry of ``info_dict``
    (``'Untitled'`` when absent) and placed in ``download_path``.

    Returns:
        ``(transcription_file_path, summary_file_path)``; the summary path is
        ``None`` when ``summary_text`` is falsy. ``(None, None)`` is returned
        if anything raises (the error is logged with its traceback).
    """
    try:
        base_name = sanitize_filename(info_dict.get('title', 'Untitled'))

        # Transcription is always written
        transcript_target = os.path.join(download_path, f"{base_name}_transcription.txt")
        with open(transcript_target, 'w', encoding='utf-8') as out:
            out.write(transcription_text)

        # Summary is optional
        if not summary_text:
            return transcript_target, None

        summary_target = os.path.join(download_path, f"{base_name}_summary.txt")
        with open(summary_target, 'w', encoding='utf-8') as out:
            out.write(summary_text)
        return transcript_target, summary_target
    except Exception as e:
        logging.error(f"Error in save_transcription_and_summary: {str(e)}", exc_info=True)
        return None, None
1198
+
1199
+
1200
def summarize_chunk(api_name, text, custom_prompt_input, api_key, temp=None, system_message=None):
    """Run ``summarize`` on one chunk of text and normalise failures to ``None``.

    Returns:
        The summary returned by ``summarize``, or ``None`` when that result is
        ``None``, starts with ``"Error:"``, or an exception is raised.
    """
    logging.debug("Entered 'summarize_chunk' function")
    try:
        outcome = summarize(text, custom_prompt_input, api_name, api_key, temp, system_message)
        succeeded = outcome is not None and not outcome.startswith("Error:")
        if succeeded:
            logging.info(f"Summarization with {api_name} successful")
            return outcome
        logging.warning(f"Summarization with {api_name} failed: {outcome}")
        return None
    except Exception as e:
        logging.error(f"Error in summarize_chunk with {api_name}: {str(e)}", exc_info=True)
        return None
1212
+
1213
+
1214
def extract_metadata_and_content(input_data):
    """Split input into a metadata dict and a content string.

    Args:
        input_data: A path to a JSON file, a JSON string, plain text, a dict,
            or any other object.

    Returns:
        ``(metadata, content)``.

        * Plain text (not a path, not JSON) or JSON text that decodes to a
          scalar: ``({}, input_data)``.
        * A dict (given directly, decoded from a string, or loaded from a file):
          metadata holds ``title`` and ``author`` with defaults; content comes
          from ``transcription``, ``segments`` or ``content`` in that order,
          else the dict dumped as JSON.
        * A list (decoded or loaded): default metadata, with the list passed
          to ``extract_text_from_segments``.
        * Any other input type: ``({}, str(input_data))``.
    """
    if isinstance(input_data, str):
        if os.path.exists(input_data):
            with open(input_data, 'r', encoding='utf-8') as file:
                data = json.load(file)
        else:
            try:
                data = json.loads(input_data)
            except json.JSONDecodeError:
                return {}, input_data
            # Text such as "42" or "null" is valid JSON but is not structured
            # input; treat it as plain text instead of crashing on .get().
            if not isinstance(data, (dict, list)):
                return {}, input_data
    elif isinstance(input_data, dict):
        data = input_data
    else:
        return {}, str(input_data)

    if isinstance(data, list):
        # A bare list is handled as transcript segments.
        default_metadata = {'title': 'No title available', 'author': 'Unknown author'}
        return default_metadata, extract_text_from_segments(data)
    if not isinstance(data, dict):
        # A file holding a JSON scalar: no metadata to extract.
        return {}, json.dumps(data)

    # Extract metadata
    metadata = {
        'title': data.get('title', 'No title available'),
        'author': data.get('author', 'Unknown author'),
    }

    # Extract content
    if 'transcription' in data:
        content = extract_text_from_segments(data['transcription'])
    elif 'segments' in data:
        content = extract_text_from_segments(data['segments'])
    elif 'content' in data:
        content = data['content']
    else:
        content = json.dumps(data)

    return metadata, content
1247
+
1248
+
1249
def format_input_with_metadata(metadata, content):
    """Prefix ``content`` with a two-line ``Title:`` / ``Author:`` header.

    Args:
        metadata: Mapping queried for ``'title'`` and ``'author'``; missing
            keys fall back to placeholder text.
        content: The body text appended after a blank line.

    Returns:
        The header followed by ``content``.
    """
    title = metadata.get('title', 'No title available')
    author = metadata.get('author', 'Unknown author')
    header = f"Title: {title}\nAuthor: {author}\n\n"
    return header + content
1254
+
1255
def perform_summarization(api_name, input_data, custom_prompt_input, api_key, recursive_summarization=False, temp=None, system_message=None):
    """Summarize ``input_data`` with the given API, optionally chunk by chunk.

    The input is split into metadata and content, re-assembled into a single
    structured text, and summarized either in one call or (when
    ``recursive_summarization`` is true) over word-based chunks. When
    ``input_data`` is a path to an existing file, the summary is also written
    next to it as ``<name>_summary.txt``.

    Args:
        api_name: Name of the summarization backend.
        input_data: File path, JSON string, plain text or dict to summarize.
        custom_prompt_input: Prompt forwarded to the summarizer.
        api_key: API key forwarded to the summarizer.
        recursive_summarization: Chunk the input and summarize recursively.
        temp: Optional value forwarded to the summarizer.
        system_message: Optional system message; a built-in bulleted-notes
            prompt is used when ``None``.

    Returns:
        The summary on success; ``None`` when summarization failed or a
        connection error occurred; an ``"An error occurred ..."`` string for
        any other exception.
    """
    # The returned configuration is not used in this function; the call is
    # kept because the original made it.
    load_and_log_configs()
    logging.info("Starting summarization process...")
    if system_message is None:
        system_message = """
You are a bulleted notes specialist. ```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.
**Bulleted Note Creation Guidelines**

**Headings**:
- Based on referenced topics, not categories like quotes or terms
- Surrounded by **bold** formatting
- Not listed as bullet points
- No space between headings and list items underneath

**Emphasis**:
- **Important terms** set in bold font
- **Text ending in a colon**: also bolded

**Review**:
- Ensure adherence to specified format
- Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]"""

    try:
        logging.debug(f"Input data type: {type(input_data)}")
        logging.debug(f"Input data (first 500 chars): {str(input_data)[:500]}...")

        # Extract metadata and content
        metadata, content = extract_metadata_and_content(input_data)

        logging.debug(f"Extracted metadata: {metadata}")
        logging.debug(f"Extracted content (first 500 chars): {content[:500]}...")

        # Prepare a structured input for summarization
        structured_input = format_input_with_metadata(metadata, content)

        # Perform summarization on the structured input
        if recursive_summarization:
            chunk_options = {
                'method': 'words',  # or 'sentences', 'paragraphs', 'tokens' based on your preference
                'max_size': 1000,  # adjust as needed
                'overlap': 100,  # adjust as needed
                'adaptive': False,
                'multi_level': False,
                'language': 'english'
            }
            chunks = improved_chunking_process(structured_input, chunk_options)
            logging.debug(f"Chunking process completed. Number of chunks: {len(chunks)}")
            logging.debug("Now performing recursive summarization on each chunk...")
            logging.debug("summary = recursive_summarize_chunks")
            # NOTE(review): the per-chunk callable takes a single argument and
            # does not forward temp/system_message to summarize_chunk; confirm
            # this matches how recursive_summarize_chunks invokes it.
            summary = recursive_summarize_chunks([chunk['text'] for chunk in chunks],
                                                 lambda x: summarize_chunk(api_name, x, custom_prompt_input, api_key),
                                                 custom_prompt_input, temp, system_message)
        else:
            logging.debug("summary = summarize_chunk")
            summary = summarize_chunk(api_name, structured_input, custom_prompt_input, api_key, temp, system_message)

        # add some actual validation logic
        if summary is not None:
            logging.info(f"Summary generated using {api_name} API")
            if isinstance(input_data, str) and os.path.exists(input_data):
                # Swap only the final extension. str.replace('.json', ...)
                # returned the input path unchanged when it had no '.json',
                # which made the summary overwrite the source file.
                base_path, _ = os.path.splitext(input_data)
                summary_file_path = f"{base_path}_summary.txt"
                with open(summary_file_path, 'w', encoding='utf-8') as file:
                    file.write(summary)
            # Only report success when a summary was actually produced.
            logging.info("Summarization completed successfully.")
        else:
            logging.warning(f"Failed to generate summary using {api_name} API")

        return summary

    except requests.exceptions.ConnectionError:
        logging.error("Connection error while summarizing")
    except Exception as e:
        logging.error(f"Error summarizing with {api_name}: {str(e)}", exc_info=True)
        return f"An error occurred during summarization: {str(e)}"
    # Reached only after a connection error.
    return None
1331
+
1332
def extract_text_from_input(input_data):
    """Flatten ``input_data`` into a single text string.

    Strings are first tried as JSON. A JSON object (or a dict passed
    directly) is rendered as labelled sections for its ``title``,
    ``description`` and ``transcription`` / ``segments`` fields, separated by
    blank lines. Strings that are not JSON objects are returned unchanged, and
    any other type is converted with ``str``.

    Args:
        input_data: JSON string, plain text, dict, or any other object.

    Returns:
        The extracted text. This is an empty string when a dict contains none
        of the recognised fields.
    """
    if isinstance(input_data, str):
        try:
            # Try to parse as JSON
            data = json.loads(input_data)
        except json.JSONDecodeError:
            # If not valid JSON, treat as plain text
            return input_data
        # Text such as "42", "[1, 2]" or '"quoted"' is valid JSON but not an
        # object; the field lookups below would crash or silently yield ''.
        if not isinstance(data, dict):
            return input_data
    elif isinstance(input_data, dict):
        data = input_data
    else:
        return str(input_data)

    # Extract relevant fields from the JSON object
    text_parts = []
    if 'title' in data:
        text_parts.append(f"Title: {data['title']}")
    if 'description' in data:
        text_parts.append(f"Description: {data['description']}")
    if 'transcription' in data:
        transcription = data['transcription']
        if isinstance(transcription, list):
            # Each list entry is read as a mapping with a 'Text' key.
            transcription_text = ' '.join(segment.get('Text', '') for segment in transcription)
        elif isinstance(transcription, str):
            transcription_text = transcription
        else:
            transcription_text = str(transcription)
        text_parts.append(f"Transcription: {transcription_text}")
    elif 'segments' in data:
        segments_text = extract_text_from_segments(data['segments'])
        text_parts.append(f"Segments: {segments_text}")

    return '\n\n'.join(text_parts)
1364
+
1365
+
1366
+
1367
def process_url(
        url,
        num_speakers,
        whisper_model,
        custom_prompt_input,
        offset,
        api_name,
        api_key,
        vad_filter,
        download_video_flag,
        download_audio,
        rolling_summarization,
        detail_level,
        # It's for the asking a question about a returned prompt - needs to be removed #FIXME
        question_box,
        keywords,
        chunk_text_by_words,
        max_words,
        chunk_text_by_sentences,
        max_sentences,
        chunk_text_by_paragraphs,
        max_paragraphs,
        chunk_text_by_tokens,
        max_tokens,
        chunk_by_semantic,
        semantic_chunk_size,
        semantic_chunk_overlap,
        local_file_path=None,
        diarize=False,
        recursive_summarization=False,
        temp=None,
        system_message=None):
    """Download, transcribe, summarize and store a single video URL.

    A ``url`` containing several newline-separated entries is delegated to
    ``process_video_urls``. Otherwise the video is downloaded and transcribed
    (optionally diarized), optionally chunked and summarized per chunk, then
    summarized as a whole, written to disk and added to the media database.

    The module-level globals ``info_dict`` and ``segments`` are overwritten.

    Args:
        url: Video URL, or several URLs separated by newlines.
        whisper_model, offset, vad_filter: Forwarded to transcription.
        custom_prompt_input, api_name, api_key, temp, system_message:
            Forwarded to the summarization helpers.
        download_video_flag: Forwarded to ``download_video``.
        rolling_summarization, detail_level: Select and configure
            ``rolling_summarize`` for the overall summary.
        keywords: Forwarded to ``add_media_to_database``.
        chunk_text_by_* / max_* / chunk_by_semantic / semantic_chunk_size:
            Chunking switches and their size limits.
        local_file_path: Only consulted by the "no input" validation.
        diarize: Use ``combine_transcription_and_diarization`` for the
            transcription text.
        recursive_summarization: Forwarded to the summarization helpers.
        num_speakers, download_audio, question_box, semantic_chunk_overlap:
            Not used for single-URL processing; the first three are forwarded
            to ``process_video_urls`` for multi-URL input.

    Returns:
        For multi-URL input, whatever ``process_video_urls`` returns.
        Otherwise ``(transcription_text, summary_text, json_file_path,
        summary_file_path, None, None)`` on success, or a tuple starting with
        two message strings on failure.
        NOTE(review): the two input-validation failures return 8 values while
        every other path returns 6 - confirm which arity callers expect.
    """
    # Validate input
    if not url and not local_file_path:
        return "Process_URL: No URL provided.", "No URL provided.", None, None, None, None, None, None

    if isinstance(url, str):
        urls = url.strip().split('\n')
        if len(urls) > 1:
            return process_video_urls(urls, num_speakers, whisper_model, custom_prompt_input, offset, api_name, api_key, vad_filter,
                                      download_video_flag, download_audio, rolling_summarization, detail_level, question_box,
                                      keywords, chunk_text_by_words, max_words, chunk_text_by_sentences, max_sentences,
                                      chunk_text_by_paragraphs, max_paragraphs, chunk_text_by_tokens, max_tokens, chunk_by_semantic, semantic_chunk_size, semantic_chunk_overlap, recursive_summarization)

    if url and not is_valid_url(url):
        return "Process_URL: Invalid URL format.", "Invalid URL format.", None, None, None, None, None, None

    if url:
        # Clean the URL to remove playlist parameters if any
        url = clean_youtube_url(url)
        logging.info(f"Process_URL: Processing URL: {url}")

    if api_name:
        print("Process_URL: API Name received:", api_name)  # Debugging line

    global info_dict
    global segments

    # Evaluated once and parenthesised: the original wrote
    # "a or b or c or d or e and api_name", where "and" binds only to "e".
    chunking_requested = bool(
        chunk_text_by_words or chunk_text_by_sentences or chunk_text_by_paragraphs
        or chunk_text_by_tokens or chunk_by_semantic
    )

    # If URL/Local video file is provided
    try:
        info_dict, title = extract_video_info(url)
        download_path = create_download_directory(title)
        video_path = download_video(url, download_path, info_dict, download_video_flag, whisper_model)
        audio_file_path, segments = perform_transcription(video_path, offset, whisper_model, vad_filter)

        # Bail out before doing any further work on a failed transcription.
        if audio_file_path is None or segments is None:
            logging.error("Process_URL: Transcription failed or segments not available.")
            return "Process_URL: Transcription failed.", "Transcription failed.", None, None, None, None

        if diarize:
            transcription_text = combine_transcription_and_diarization(audio_file_path)
        else:
            # Reuse the result above instead of transcribing a second time.
            transcription_text = {'audio_file': audio_file_path, 'transcription': segments}

        logging.debug(f"Process_URL: Transcription audio_file: {audio_file_path}")
        logging.debug(f"Process_URL: Transcription segments: {segments}")
        logging.debug(f"Process_URL: Transcription text: {transcription_text}")

        # FIXME - Implement chunking calls here
        # NOTE(review): chunk_text_by_words / _sentences / _paragraphs /
        # _tokens are parameters of this function. If callers pass booleans,
        # the calls below raise TypeError, because the parameters shadow any
        # same-named chunking helpers. The helpers need to be reachable under
        # different names before this path can work - confirm and fix.
        chunked_transcriptions = []
        if chunk_text_by_words:
            chunked_transcriptions = chunk_text_by_words(transcription_text['transcription'], max_words)
        elif chunk_text_by_sentences:
            chunked_transcriptions = chunk_text_by_sentences(transcription_text['transcription'], max_sentences)
        elif chunk_text_by_paragraphs:
            chunked_transcriptions = chunk_text_by_paragraphs(transcription_text['transcription'], max_paragraphs)
        elif chunk_text_by_tokens:
            chunked_transcriptions = chunk_text_by_tokens(transcription_text['transcription'], max_tokens)
        elif chunk_by_semantic:
            chunked_transcriptions = semantic_chunking(transcription_text['transcription'], semantic_chunk_size, 'tokens')
        # FIXME - rolling summarization over chunks is not implemented here;
        # the overall rolling summary is produced further below.

        summarized_chunk_transcriptions = []

        if chunking_requested and api_name:
            # Perform summarization based on chunks
            for chunk in chunked_transcriptions:
                if api_name == "anthropic":
                    summary = summarize_with_anthropic(api_key, chunk, custom_prompt_input)
                elif api_name == "cohere":
                    summary = summarize_with_cohere(api_key, chunk, custom_prompt_input, temp, system_message)
                elif api_name == "openai":
                    summary = summarize_with_openai(api_key, chunk, custom_prompt_input, temp, system_message)
                elif api_name == "Groq":
                    summary = summarize_with_groq(api_key, chunk, custom_prompt_input, temp, system_message)
                elif api_name == "DeepSeek":
                    summary = summarize_with_deepseek(api_key, chunk, custom_prompt_input, temp, system_message)
                elif api_name == "OpenRouter":
                    summary = summarize_with_openrouter(api_key, chunk, custom_prompt_input, temp, system_message)
                elif api_name == "Llama.cpp":
                    summary = summarize_with_llama(chunk, custom_prompt_input, temp, system_message)
                elif api_name == "Kobold":
                    summary = summarize_with_kobold(chunk, custom_prompt_input, temp, system_message)
                elif api_name == "Ooba":
                    summary = summarize_with_oobabooga(chunk, custom_prompt_input, temp, system_message)
                elif api_name == "Tabbyapi":
                    summary = summarize_with_tabbyapi(chunk, custom_prompt_input, temp, system_message)
                elif api_name == "VLLM":
                    summary = summarize_with_vllm(chunk, custom_prompt_input, temp, system_message)
                else:
                    # Previously 'summary' was left unbound here (NameError).
                    logging.warning(f"Process_URL: Unsupported API for chunk summarization: {api_name}")
                    break
                if summary is not None:
                    summarized_chunk_transcriptions.append(summary)

        combined_transcription_text = ''
        combined_summary_text = ''
        if chunking_requested:
            # Combine chunked transcriptions into a single file
            combined_transcription_text = '\n\n'.join(chunked_transcriptions)
            combined_transcription_file_path = os.path.join(download_path, 'combined_transcription.txt')
            with open(combined_transcription_file_path, 'w', encoding='utf-8') as f:
                f.write(combined_transcription_text)

            # Combine summarized chunk transcriptions into a single file
            combined_summary_text = '\n\n'.join(summarized_chunk_transcriptions)
            combined_summary_file_path = os.path.join(download_path, 'combined_summary.txt')
            with open(combined_summary_file_path, 'w', encoding='utf-8') as f:
                f.write(combined_summary_text)

        # Handle rolling summarization
        if rolling_summarization:
            summary_text = rolling_summarize(
                text=extract_text_from_segments(segments),
                detail=detail_level,
                model='gpt-4-turbo',
                additional_instructions=custom_prompt_input,
                summarize_recursively=recursive_summarization
            )
        elif api_name:
            # NOTE(review): segments_json_path is not assigned anywhere in
            # this function; unless it is a module-level global this raises
            # NameError - confirm where it is meant to come from.
            summary_text = perform_summarization(api_name, segments_json_path, custom_prompt_input, api_key,
                                                 recursive_summarization, temp=temp,
                                                 system_message=system_message)
        else:
            summary_text = 'Summary not available'

        # Check to see if chunking was performed, and if so, save that instead
        if chunking_requested:
            # Pass the combined *text*; the original passed the file paths,
            # so the paths themselves were written as the file contents.
            json_file_path, summary_file_path = save_transcription_and_summary(
                combined_transcription_text, combined_summary_text, download_path, info_dict)
        else:
            # save_transcription_and_summary writes its first argument with
            # f.write, so a non-string transcription must be serialized first.
            if isinstance(transcription_text, str):
                text_to_save = transcription_text
            else:
                try:
                    text_to_save = json.dumps(transcription_text, indent=2)
                except (TypeError, ValueError):
                    text_to_save = str(transcription_text)
            json_file_path, summary_file_path = save_transcription_and_summary(
                text_to_save, summary_text, download_path, info_dict)

        add_media_to_database(url, info_dict, segments, summary_text, keywords, custom_prompt_input, whisper_model)
        return transcription_text, summary_text, json_file_path, summary_file_path, None, None

    except Exception as e:
        logging.error(f"Process_URL: Error processing the request: {e}", exc_info=True)
        return str(e), 'process_url: Error processing the request.', None, None, None, None
1577
+
1578
+ #
1579
+ #
1580
+ ############################################################################################################################################
App_Function_Libraries/Summarization/__init__.py ADDED
File without changes