# Old_Chunking_Lib.py
#########################################
# Old Chunking Library
# This library is used to handle chunking of text for summarization.
#
####
####################
# Function List
#
# 1. chunk_transcript(transcript: str, chunk_duration: int, words_per_second: int) -> List[str]
# 2. summarize_chunks(api_name: str, api_key: str, transcript: List[dict], chunk_duration: int, words_per_second: int) -> str
# 3. get_chat_completion(messages, model='gpt-4-turbo')
# 4. chunk_on_delimiter(input_string: str, max_tokens: int, delimiter: str) -> List[str]
# 5. combine_chunks_with_no_minimum(chunks: List[str], max_tokens: int, chunk_delimiter="\n\n", header: Optional[str] = None, add_ellipsis_for_overflow=False) -> Tuple[List[str], List[List[int]], int]
# 6. rolling_summarize(text: str, detail: float = 0, model: str = 'gpt-4-turbo', additional_instructions: Optional[str] = None, minimum_chunk_size: Optional[int] = 500, chunk_delimiter: str = ".", summarize_recursively=False, verbose=False)
#
####################

# Import necessary libraries
import logging
import os
from typing import Optional, List, Tuple
#
# Import 3rd party
from openai import OpenAI
from App_Function_Libraries.Tokenization_Methods_Lib import openai_tokenize
#
# Import Local
#
#######################################################################################################################
# Function Definitions
#

######### Words-per-second Chunking #########
def chunk_transcript(transcript: str, chunk_duration: int, words_per_second: int) -> List[str]:
    # Split the transcript into chunks of roughly chunk_duration seconds of speech,
    # using words_per_second as an estimated speaking rate.
    words = transcript.split()
    words_per_chunk = chunk_duration * words_per_second
    chunks = [' '.join(words[i:i + words_per_chunk]) for i in range(0, len(words), words_per_chunk)]
    return chunks
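
# Example (hypothetical values): at words_per_second=2, chunk_duration=60 gives
# words_per_chunk = 120, so a 600-word transcript yields five ~60-second chunks:
#   chunks = chunk_transcript(transcript_text, chunk_duration=60, words_per_second=2)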


# def summarize_chunks(api_name: str, api_key: str, transcript: List[dict], chunk_duration: int,
#                      words_per_second: int) -> str:
#     if api_name not in summarizers:  # See 'summarizers' dict in the main script
#         return f"Unsupported API: {api_name}"
#
#     summarizer = summarizers[api_name]
#     text = extract_text_from_segments(transcript)
#     chunks = chunk_transcript(text, chunk_duration, words_per_second)
#
#     summaries = []
#     for chunk in chunks:
#         if api_name == 'openai':
#             # Ensure the correct model and prompt are passed
#             summaries.append(summarizer(api_key, chunk, custom_prompt))
#         else:
#             summaries.append(summarizer(api_key, chunk))
#
#     return "\n\n".join(summaries)


#######################################


######### Token-size Chunking ######### FIXME - OpenAI only currently
# This is dirty and shameful and terrible. It should be replaced with a proper implementation.
# Anyways, let's get to it....
# FIXME - Hardcoded fallback key; prefer setting the OPENAI_API_KEY environment variable.
openai_api_key = os.environ.get("OPENAI_API_KEY", "Fake_key")
client = OpenAI(api_key=openai_api_key)

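
# get_chat_completion() is item 3 in the Function List above, but its definition is missing
# from this file. Below is a minimal sketch, assuming the standard OpenAI chat.completions
# call and the `client` created above; temperature=0 is an assumption, not a confirmed setting.
def get_chat_completion(messages, model='gpt-4-turbo'):
    # Send the chat messages to the OpenAI API and return the first choice's text.
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,
    )
    return response.choices[0].message.content
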
# This function chunks a text into smaller pieces based on a maximum token count and a delimiter
def chunk_on_delimiter(input_string: str, max_tokens: int, delimiter: str) -> List[str]:
    chunks = input_string.split(delimiter)
    combined_chunks, _, dropped_chunk_count = combine_chunks_with_no_minimum(
        chunks, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True)
    if dropped_chunk_count > 0:
        logging.warning(f"{dropped_chunk_count} chunks were dropped due to exceeding the token limit.")
    combined_chunks = [f"{chunk}{delimiter}" for chunk in combined_chunks]
    return combined_chunks
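
# combine_chunks_with_no_minimum() is item 5 in the Function List and is called above, but its
# definition is missing from this file. The sketch below follows the OpenAI cookbook routine
# this code mirrors (an assumption, not the author's confirmed code): greedily pack the split
# pieces into combined chunks of at most max_tokens tokens (counted with openai_tokenize),
# returning the combined chunks, the source indices behind each one, and a count of pieces
# dropped because they exceeded the limit on their own.
def combine_chunks_with_no_minimum(
        chunks: List[str],
        max_tokens: int,
        chunk_delimiter: str = "\n\n",
        header: Optional[str] = None,
        add_ellipsis_for_overflow: bool = False,
) -> Tuple[List[str], List[List[int]], int]:
    dropped_chunk_count = 0
    output = []  # final combined chunks
    output_indices = []  # indices of the source pieces inside each combined chunk
    candidate = [] if header is None else [header]  # combined chunk currently being built
    candidate_indices = []
    for chunk_i, chunk in enumerate(chunks):
        chunk_with_header = [chunk] if header is None else [header, chunk]
        # A piece that exceeds max_tokens on its own cannot be packed; optionally mark it
        # with an ellipsis in the current candidate, then drop it.
        if len(openai_tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens:
            logging.warning("Chunk overflow: a single piece exceeds max_tokens")
            if (add_ellipsis_for_overflow
                    and len(openai_tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens):
                candidate.append("...")
            dropped_chunk_count += 1
            continue
        # Would adding this piece push the candidate over the token limit?
        extended_candidate_token_count = len(openai_tokenize(chunk_delimiter.join(candidate + [chunk])))
        if extended_candidate_token_count > max_tokens:
            # Flush the current candidate and start a new one (re-seeded with the header, if any).
            output.append(chunk_delimiter.join(candidate))
            output_indices.append(candidate_indices)
            candidate = chunk_with_header
            candidate_indices = [chunk_i]
        else:
            candidate.append(chunk)
            candidate_indices.append(chunk_i)
    # Flush the final candidate if it holds anything beyond the optional header.
    if (header is None and len(candidate) > 0) or (header is not None and len(candidate) > 1):
        output.append(chunk_delimiter.join(candidate))
        output_indices.append(candidate_indices)
    return output, output_indices, dropped_chunk_count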





#######################################


######### Words-per-second Chunk Summarization #########
# FIXME - Whole section needs to be re-written


# def summarize_chunks(api_name: str, api_key: str, transcript: List[dict], chunk_duration: int,
#                      words_per_second: int) -> str:
    # if api_name not in summarizers:  # See 'summarizers' dict in the main script
    #     return f"Unsupported API: {api_name}"
    #
    # if not transcript:
    #     logging.error("Empty or None transcript provided to summarize_chunks")
    #     return "Error: Empty or None transcript provided"
    #
    # text = extract_text_from_segments(transcript)
    # chunks = chunk_transcript(text, chunk_duration, words_per_second)
    #
    # #FIXME
    # custom_prompt = args.custom_prompt
    #
    # summaries = []
    # for chunk in chunks:
    #     if api_name == 'openai':
    #         # Ensure the correct model and prompt are passed
    #         summaries.append(summarize_with_openai(api_key, chunk, custom_prompt))
    #     elif api_name == 'anthropic':
    #         summaries.append(summarize_with_anthropic(api_key, chunk, anthropic_model, custom_prompt))
    #     elif api_name == 'cohere':
    #         summaries.append(summarize_with_cohere(api_key, chunk, cohere_model, custom_prompt))
    #     elif api_name == 'groq':
    #         summaries.append(summarize_with_groq(api_key, chunk, groq_model, custom_prompt))
    #     elif api_name == 'llama':
    #         summaries.append(summarize_with_llama(llama_api_IP, chunk, api_key, custom_prompt))
    #     elif api_name == 'kobold':
    #         summaries.append(summarize_with_kobold(kobold_api_IP, chunk, api_key, custom_prompt))
    #     elif api_name == 'ooba':
    #         summaries.append(summarize_with_oobabooga(ooba_api_IP, chunk, api_key, custom_prompt))
    #     elif api_name == 'tabbyapi':
    #         summaries.append(summarize_with_vllm(api_key, tabby_api_IP, chunk, summarize.llm_model, custom_prompt))
    #     elif api_name == 'local-llm':
    #         summaries.append(summarize_with_local_llm(chunk, custom_prompt))
    #     else:
    #         return f"Unsupported API: {api_name}"
    #
    # return "\n\n".join(summaries)

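# rolling_summarize() is item 6 in the Function List but its definition is missing from this
# file. Below is a sketch following the OpenAI cookbook routine this library mirrors (an
# assumption, not the author's confirmed code): `detail` in [0, 1] interpolates between a
# single chunk (0) and the maximum number of minimum_chunk_size-token chunks (1); each chunk
# is summarized with get_chat_completion(), optionally conditioned on prior summaries.
def rolling_summarize(text: str,
                      detail: float = 0,
                      model: str = 'gpt-4-turbo',
                      additional_instructions: Optional[str] = None,
                      minimum_chunk_size: Optional[int] = 500,
                      chunk_delimiter: str = ".",
                      summarize_recursively=False,
                      verbose=False):
    assert 0 <= detail <= 1, "detail must be between 0 and 1"

    # Interpolate the number of chunks from the requested level of detail.
    max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter))
    min_chunks = 1
    num_chunks = int(min_chunks + detail * (max_chunks - min_chunks))

    # Derive a chunk size from the document length, then re-chunk the text to match it.
    document_length = len(openai_tokenize(text))
    chunk_size = max(minimum_chunk_size, document_length // num_chunks)
    text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
    if verbose:
        print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.")
        print(f"Chunk lengths are {[len(openai_tokenize(x)) for x in text_chunks]}")

    system_message_content = "Rewrite this text in summarized form."
    if additional_instructions is not None:
        system_message_content += f"\n\n{additional_instructions}"

    accumulated_summaries = []
    for chunk in text_chunks:
        if summarize_recursively and accumulated_summaries:
            # Prepend the summaries so far, so each call stays consistent with earlier chunks.
            previous = '\n\n'.join(accumulated_summaries)
            user_message_content = f"Previous summaries:\n\n{previous}\n\nText to summarize next:\n\n{chunk}"
        else:
            user_message_content = chunk
        messages = [
            {"role": "system", "content": system_message_content},
            {"role": "user", "content": user_message_content},
        ]
        accumulated_summaries.append(get_chat_completion(messages, model=model))

    # The final summary is the concatenation of the per-chunk summaries.
    return '\n\n'.join(accumulated_summaries)
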
# FIXME - Whole section needs to be re-written
def summarize_with_detail_openai(text, detail, verbose=False):
    # Summarize at the requested level of detail; report the summary's token count when verbose.
    summary_with_detail_variable = rolling_summarize(text, detail=detail, verbose=verbose)
    if verbose:
        print(len(openai_tokenize(summary_with_detail_variable)))
    return summary_with_detail_variable


def summarize_with_detail_recursive_openai(text, detail, verbose=False):
    # Same as above, but each chunk's summary is conditioned on the summaries accumulated so far.
    summary_with_recursive_summarization = rolling_summarize(text, detail=detail,
                                                             summarize_recursively=True, verbose=verbose)
    return summary_with_recursive_summarization
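
# Example usage (hypothetical variable names): `detail` interpolates between a single-chunk
# summary (0) and many small chunks (1), so higher values yield longer, finer-grained summaries:
#   brief = summarize_with_detail_openai(long_text, detail=0.1)
#   detailed = summarize_with_detail_recursive_openai(long_text, detail=0.7, verbose=True)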

#
#
#################################################################################