oceansweep committed on
Commit
5d354c7
·
verified ·
1 Parent(s): c7e020d

Update App_Function_Libraries/Chunk_Lib.py

Browse files
Files changed (1) hide show
  1. App_Function_Libraries/Chunk_Lib.py +583 -467
App_Function_Libraries/Chunk_Lib.py CHANGED
@@ -1,467 +1,583 @@
1
- # Chunk_Lib.py
2
- #########################################
3
- # Chunking Library
4
- # This library is used to perform chunking of input files.
5
- # Currently, uses naive approaches. Nothing fancy.
6
- #
7
- ####
8
- # Import necessary libraries
9
- import logging
10
- import re
11
-
12
- from typing import List, Optional, Tuple, Dict, Any
13
-
14
- from openai import OpenAI
15
- from tqdm import tqdm
16
- #
17
- # Import 3rd party
18
- from transformers import GPT2Tokenizer
19
- import nltk
20
- from nltk.tokenize import sent_tokenize, word_tokenize
21
- from sklearn.feature_extraction.text import TfidfVectorizer
22
- from sklearn.metrics.pairwise import cosine_similarity
23
- #
24
- # Import Local
25
- from App_Function_Libraries.Tokenization_Methods_Lib import openai_tokenize
26
- from App_Function_Libraries.Utils import load_comprehensive_config
27
-
28
-
29
- #
30
- #######################################################################################################################
31
- # Function Definitions
32
- #
33
-
34
- # FIXME - Make sure it only downloads if it already exists, and does a check first.
35
- # Ensure NLTK data is downloaded
36
- def ntlk_prep():
37
- nltk.download('punkt')
38
-
39
- # Load GPT2 tokenizer
40
- tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
41
-
42
- # Load Config file for API keys
43
- config = load_comprehensive_config()
44
- openai_api_key = config.get('API', 'openai_api_key', fallback=None)
45
-
46
- def load_document(file_path):
47
- with open(file_path, 'r') as file:
48
- text = file.read()
49
- return re.sub('\\s+', ' ', text).strip()
50
-
51
-
52
- def improved_chunking_process(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
53
- chunk_method = chunk_options.get('method', 'words')
54
- max_chunk_size = chunk_options.get('max_size', 300)
55
- overlap = chunk_options.get('overlap', 0)
56
- language = chunk_options.get('language', 'english')
57
- adaptive = chunk_options.get('adaptive', False)
58
- multi_level = chunk_options.get('multi_level', False)
59
-
60
- if adaptive:
61
- max_chunk_size = adaptive_chunk_size(text, max_chunk_size)
62
-
63
- if multi_level:
64
- chunks = multi_level_chunking(text, chunk_method, max_chunk_size, overlap, language)
65
- else:
66
- if chunk_method == 'words':
67
- chunks = chunk_text_by_words(text, max_chunk_size, overlap)
68
- elif chunk_method == 'sentences':
69
- chunks = chunk_text_by_sentences(text, max_chunk_size, overlap, language)
70
- elif chunk_method == 'paragraphs':
71
- chunks = chunk_text_by_paragraphs(text, max_chunk_size, overlap)
72
- elif chunk_method == 'tokens':
73
- chunks = chunk_text_by_tokens(text, max_chunk_size, overlap)
74
- else:
75
- chunks = [text] # No chunking applied
76
-
77
- return [{'text': chunk, 'metadata': get_chunk_metadata(chunk, text)} for chunk in chunks]
78
-
79
-
80
- def adaptive_chunk_size(text: str, base_size: int) -> int:
81
- # Simple adaptive logic: adjust chunk size based on text complexity
82
- avg_word_length = sum(len(word) for word in text.split()) / len(text.split())
83
- if avg_word_length > 6: # Arbitrary threshold for "complex" text
84
- return int(base_size * 0.8) # Reduce chunk size for complex text
85
- return base_size
86
-
87
-
88
- def multi_level_chunking(text: str, method: str, max_size: int, overlap: int, language: str) -> List[str]:
89
- # First level: chunk by paragraphs
90
- paragraphs = chunk_text_by_paragraphs(text, max_size * 2, overlap)
91
-
92
- # Second level: chunk each paragraph further
93
- chunks = []
94
- for para in paragraphs:
95
- if method == 'words':
96
- chunks.extend(chunk_text_by_words(para, max_size, overlap))
97
- elif method == 'sentences':
98
- chunks.extend(chunk_text_by_sentences(para, max_size, overlap, language))
99
- else:
100
- chunks.append(para)
101
-
102
- return chunks
103
-
104
-
105
- def chunk_text_by_words(text: str, max_words: int = 300, overlap: int = 0) -> List[str]:
106
- words = text.split()
107
- chunks = []
108
- for i in range(0, len(words), max_words - overlap):
109
- chunk = ' '.join(words[i:i + max_words])
110
- chunks.append(chunk)
111
- return post_process_chunks(chunks)
112
-
113
-
114
- def chunk_text_by_sentences(text: str, max_sentences: int = 10, overlap: int = 0, language: str = 'english') -> List[
115
- str]:
116
- nltk.download('punkt', quiet=True)
117
- sentences = nltk.sent_tokenize(text, language=language)
118
- chunks = []
119
- for i in range(0, len(sentences), max_sentences - overlap):
120
- chunk = ' '.join(sentences[i:i + max_sentences])
121
- chunks.append(chunk)
122
- return post_process_chunks(chunks)
123
-
124
-
125
- def chunk_text_by_paragraphs(text: str, max_paragraphs: int = 5, overlap: int = 0) -> List[str]:
126
- paragraphs = re.split(r'\n\s*\n', text)
127
- chunks = []
128
- for i in range(0, len(paragraphs), max_paragraphs - overlap):
129
- chunk = '\n\n'.join(paragraphs[i:i + max_paragraphs])
130
- chunks.append(chunk)
131
- return post_process_chunks(chunks)
132
-
133
-
134
- def chunk_text_by_tokens(text: str, max_tokens: int = 1000, overlap: int = 0) -> List[str]:
135
- # This is a simplified token-based chunking. For more accurate tokenization,
136
- # consider using a proper tokenizer like GPT-2 TokenizerFast
137
- words = text.split()
138
- chunks = []
139
- current_chunk = []
140
- current_token_count = 0
141
-
142
- for word in words:
143
- word_token_count = len(word) // 4 + 1 # Rough estimate of token count
144
- if current_token_count + word_token_count > max_tokens and current_chunk:
145
- chunks.append(' '.join(current_chunk))
146
- current_chunk = current_chunk[-overlap:] if overlap > 0 else []
147
- current_token_count = sum(len(w) // 4 + 1 for w in current_chunk)
148
-
149
- current_chunk.append(word)
150
- current_token_count += word_token_count
151
-
152
- if current_chunk:
153
- chunks.append(' '.join(current_chunk))
154
-
155
- return post_process_chunks(chunks)
156
-
157
-
158
- def post_process_chunks(chunks: List[str]) -> List[str]:
159
- return [chunk.strip() for chunk in chunks if chunk.strip()]
160
-
161
-
162
- def get_chunk_metadata(chunk: str, full_text: str) -> Dict[str, Any]:
163
- start_index = full_text.index(chunk)
164
- return {
165
- 'start_index': start_index,
166
- 'end_index': start_index + len(chunk),
167
- 'word_count': len(chunk.split()),
168
- 'char_count': len(chunk)
169
- }
170
-
171
-
172
- # Hybrid approach, chunk each sentence while ensuring total token size does not exceed a maximum number
173
- def chunk_text_hybrid(text, max_tokens=1000):
174
- sentences = nltk.tokenize.sent_tokenize(text)
175
- chunks = []
176
- current_chunk = []
177
- current_length = 0
178
-
179
- for sentence in sentences:
180
- tokens = tokenizer.encode(sentence)
181
- if current_length + len(tokens) <= max_tokens:
182
- current_chunk.append(sentence)
183
- current_length += len(tokens)
184
- else:
185
- chunks.append(' '.join(current_chunk))
186
- current_chunk = [sentence]
187
- current_length = len(tokens)
188
-
189
- if current_chunk:
190
- chunks.append(' '.join(current_chunk))
191
-
192
- return chunks
193
-
194
- # Thanks openai
195
- def chunk_on_delimiter(input_string: str,
196
- max_tokens: int,
197
- delimiter: str) -> List[str]:
198
- chunks = input_string.split(delimiter)
199
- combined_chunks, _, dropped_chunk_count = combine_chunks_with_no_minimum(
200
- chunks, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True)
201
- if dropped_chunk_count > 0:
202
- print(f"Warning: {dropped_chunk_count} chunks were dropped due to exceeding the token limit.")
203
- combined_chunks = [f"{chunk}{delimiter}" for chunk in combined_chunks]
204
- return combined_chunks
205
-
206
-
207
- def recursive_summarize_chunks(chunks, summarize_func, custom_prompt):
208
- summarized_chunks = []
209
- current_summary = ""
210
-
211
- for i, chunk in enumerate(chunks):
212
- if i == 0:
213
- current_summary = summarize_func(chunk, custom_prompt)
214
- else:
215
- combined_text = current_summary + "\n\n" + chunk
216
- current_summary = summarize_func(combined_text, custom_prompt)
217
-
218
- summarized_chunks.append(current_summary)
219
-
220
- return summarized_chunks
221
-
222
-
223
- # Sample text for testing
224
- sample_text = """
225
- Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence
226
- concerned with the interactions between computers and human language, in particular how to program computers
227
- to process and analyze large amounts of natural language data. The result is a computer capable of "understanding"
228
- the contents of documents, including the contextual nuances of the language within them. The technology can then
229
- accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.
230
-
231
- Challenges in natural language processing frequently involve speech recognition, natural language understanding,
232
- and natural language generation.
233
-
234
- Natural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled
235
- "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence.
236
- """
237
-
238
- # Example usage of different chunking methods
239
- # print("Chunking by words:")
240
- # print(chunk_text_by_words(sample_text, max_words=50))
241
- #
242
- # print("\nChunking by sentences:")
243
- # print(chunk_text_by_sentences(sample_text, max_sentences=2))
244
- #
245
- # print("\nChunking by paragraphs:")
246
- # print(chunk_text_by_paragraphs(sample_text, max_paragraphs=1))
247
- #
248
- # print("\nChunking by tokens:")
249
- # print(chunk_text_by_tokens(sample_text, max_tokens=50))
250
- #
251
- # print("\nHybrid chunking:")
252
- # print(chunk_text_hybrid(sample_text, max_tokens=50))
253
-
254
-
255
-
256
- #######################################################################################################################
257
- #
258
- # Experimental Semantic Chunking
259
- #
260
-
261
- # Chunk text into segments based on semantic similarity
262
- def count_units(text, unit='tokens'):
263
- if unit == 'words':
264
- return len(text.split())
265
- elif unit == 'tokens':
266
- return len(word_tokenize(text))
267
- elif unit == 'characters':
268
- return len(text)
269
- else:
270
- raise ValueError("Invalid unit. Choose 'words', 'tokens', or 'characters'.")
271
-
272
-
273
- def semantic_chunking(text, max_chunk_size=2000, unit='words'):
274
- nltk.download('punkt', quiet=True)
275
- sentences = sent_tokenize(text)
276
- vectorizer = TfidfVectorizer()
277
- sentence_vectors = vectorizer.fit_transform(sentences)
278
-
279
- chunks = []
280
- current_chunk = []
281
- current_size = 0
282
-
283
- for i, sentence in enumerate(sentences):
284
- sentence_size = count_units(sentence, unit)
285
- if current_size + sentence_size > max_chunk_size and current_chunk:
286
- chunks.append(' '.join(current_chunk))
287
- overlap_size = count_units(' '.join(current_chunk[-3:]), unit) # Use last 3 sentences for overlap
288
- current_chunk = current_chunk[-3:] # Keep last 3 sentences for overlap
289
- current_size = overlap_size
290
-
291
- current_chunk.append(sentence)
292
- current_size += sentence_size
293
-
294
- if i + 1 < len(sentences):
295
- current_vector = sentence_vectors[i]
296
- next_vector = sentence_vectors[i + 1]
297
- similarity = cosine_similarity(current_vector, next_vector)[0][0]
298
- if similarity < 0.5 and current_size >= max_chunk_size // 2:
299
- chunks.append(' '.join(current_chunk))
300
- overlap_size = count_units(' '.join(current_chunk[-3:]), unit)
301
- current_chunk = current_chunk[-3:]
302
- current_size = overlap_size
303
-
304
- if current_chunk:
305
- chunks.append(' '.join(current_chunk))
306
-
307
- return chunks
308
-
309
-
310
- def semantic_chunk_long_file(file_path, max_chunk_size=1000, overlap=100):
311
- try:
312
- with open(file_path, 'r', encoding='utf-8') as file:
313
- content = file.read()
314
-
315
- chunks = semantic_chunking(content, max_chunk_size, overlap)
316
- return chunks
317
- except Exception as e:
318
- logging.error(f"Error chunking text file: {str(e)}")
319
- return None
320
- #######################################################################################################################
321
-
322
-
323
-
324
-
325
-
326
-
327
- #######################################################################################################################
328
- #
329
- # OpenAI Rolling Summarization
330
- #
331
-
332
- client = OpenAI(api_key=openai_api_key)
333
- def get_chat_completion(messages, model='gpt-4-turbo'):
334
- response = client.chat.completions.create(
335
- model=model,
336
- messages=messages,
337
- temperature=0,
338
- )
339
- return response.choices[0].message.content
340
-
341
-
342
- # This function combines text chunks into larger blocks without exceeding a specified token count.
343
- # It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow.
344
- def combine_chunks_with_no_minimum(
345
- chunks: List[str],
346
- max_tokens: int,
347
- chunk_delimiter="\n\n",
348
- header: Optional[str] = None,
349
- add_ellipsis_for_overflow=False,
350
- ) -> Tuple[List[str], List[int]]:
351
- dropped_chunk_count = 0
352
- output = [] # list to hold the final combined chunks
353
- output_indices = [] # list to hold the indices of the final combined chunks
354
- candidate = (
355
- [] if header is None else [header]
356
- ) # list to hold the current combined chunk candidate
357
- candidate_indices = []
358
- for chunk_i, chunk in enumerate(chunks):
359
- chunk_with_header = [chunk] if header is None else [header, chunk]
360
- # FIXME MAKE NOT OPENAI SPECIFIC
361
- if len(openai_tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens:
362
- print(f"warning: chunk overflow")
363
- if (
364
- add_ellipsis_for_overflow
365
- # FIXME MAKE NOT OPENAI SPECIFIC
366
- and len(openai_tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens
367
- ):
368
- candidate.append("...")
369
- dropped_chunk_count += 1
370
- continue # this case would break downstream assumptions
371
- # estimate token count with the current chunk added
372
- # FIXME MAKE NOT OPENAI SPECIFIC
373
- extended_candidate_token_count = len(openai_tokenize(chunk_delimiter.join(candidate + [chunk])))
374
- # If the token count exceeds max_tokens, add the current candidate to output and start a new candidate
375
- if extended_candidate_token_count > max_tokens:
376
- output.append(chunk_delimiter.join(candidate))
377
- output_indices.append(candidate_indices)
378
- candidate = chunk_with_header # re-initialize candidate
379
- candidate_indices = [chunk_i]
380
- # otherwise keep extending the candidate
381
- else:
382
- candidate.append(chunk)
383
- candidate_indices.append(chunk_i)
384
- # add the remaining candidate to output if it's not empty
385
- if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0):
386
- output.append(chunk_delimiter.join(candidate))
387
- output_indices.append(candidate_indices)
388
- return output, output_indices, dropped_chunk_count
389
-
390
-
391
- def rolling_summarize(text: str,
392
- detail: float = 0,
393
- model: str = 'gpt-4-turbo',
394
- additional_instructions: Optional[str] = None,
395
- minimum_chunk_size: Optional[int] = 500,
396
- chunk_delimiter: str = ".",
397
- summarize_recursively=False,
398
- verbose=False):
399
- """
400
- Summarizes a given text by splitting it into chunks, each of which is summarized individually.
401
- The level of detail in the summary can be adjusted, and the process can optionally be made recursive.
402
-
403
- Parameters:
404
- - text (str): The text to be summarized.
405
- - detail (float, optional): A value between 0 and 1
406
- indicating the desired level of detail in the summary. 0 leads to a higher level summary, and 1 results in a more
407
- detailed summary. Defaults to 0.
408
- - additional_instructions (Optional[str], optional): Additional instructions to provide to the
409
- model for customizing summaries. - minimum_chunk_size (Optional[int], optional): The minimum size for text
410
- chunks. Defaults to 500.
411
- - chunk_delimiter (str, optional): The delimiter used to split the text into chunks. Defaults to ".".
412
- - summarize_recursively (bool, optional): If True, summaries are generated recursively, using previous summaries for context.
413
- - verbose (bool, optional): If True, prints detailed information about the chunking process.
414
- Returns:
415
- - str: The final compiled summary of the text.
416
-
417
- The function first determines the number of chunks by interpolating between a minimum and a maximum chunk count
418
- based on the `detail` parameter. It then splits the text into chunks and summarizes each chunk. If
419
- `summarize_recursively` is True, each summary is based on the previous summaries, adding more context to the
420
- summarization process. The function returns a compiled summary of all chunks.
421
- """
422
-
423
- # check detail is set correctly
424
- assert 0 <= detail <= 1
425
-
426
- # interpolate the number of chunks based to get specified level of detail
427
- max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter))
428
- min_chunks = 1
429
- num_chunks = int(min_chunks + detail * (max_chunks - min_chunks))
430
-
431
- # adjust chunk_size based on interpolated number of chunks
432
- # FIXME MAKE NOT OPENAI SPECIFIC
433
- document_length = len(openai_tokenize(text))
434
- chunk_size = max(minimum_chunk_size, document_length // num_chunks)
435
- text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
436
- if verbose:
437
- print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.")
438
- # FIXME MAKE NOT OPENAI SPECIFIC
439
- print(f"Chunk lengths are {[len(openai_tokenize(x)) for x in text_chunks]}")
440
-
441
- # set system message - FIXME
442
- system_message_content = "Rewrite this text in summarized form."
443
- if additional_instructions is not None:
444
- system_message_content += f"\n\n{additional_instructions}"
445
-
446
- accumulated_summaries = []
447
- for i, chunk in enumerate(tqdm(text_chunks)):
448
- if summarize_recursively and accumulated_summaries:
449
- # Combine previous summary with current chunk for recursive summarization
450
- combined_text = accumulated_summaries[-1] + "\n\n" + chunk
451
- user_message_content = f"Previous summary and new content to summarize:\n\n{combined_text}"
452
- else:
453
- user_message_content = chunk
454
-
455
- messages = [
456
- {"role": "system", "content": system_message_content},
457
- {"role": "user", "content": user_message_content}
458
- ]
459
-
460
- response = get_chat_completion(messages, model=model)
461
- accumulated_summaries.append(response)
462
-
463
- final_summary = '\n\n'.join(accumulated_summaries)
464
- return final_summary
465
-
466
-
467
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Chunk_Lib.py
2
+ #########################################
3
+ # Chunking Library
4
+ # This library is used to perform chunking of input files.
5
+ # Currently, uses naive approaches. Nothing fancy.
6
+ #
7
+ ####
8
+ # Import necessary libraries
9
+ import logging
10
+ import re
11
+
12
+ from typing import List, Optional, Tuple, Dict, Any
13
+
14
+ from openai import OpenAI
15
+ from tqdm import tqdm
16
+ #
17
+ # Import 3rd party
18
+ from transformers import GPT2Tokenizer
19
+ import nltk
20
+ from nltk.tokenize import sent_tokenize, word_tokenize
21
+ from sklearn.feature_extraction.text import TfidfVectorizer
22
+ from sklearn.metrics.pairwise import cosine_similarity
23
+ #
24
+ # Import Local
25
+ from App_Function_Libraries.Tokenization_Methods_Lib import openai_tokenize
26
+ from App_Function_Libraries.Utils import load_comprehensive_config
27
+
28
+
29
+ #
30
+ #######################################################################################################################
31
+ # Function Definitions
32
+ #
33
+
34
# Ensure NLTK data is downloaded
def ntlk_prep():
    """Make sure the NLTK 'punkt' tokenizer data is available locally.

    The data is looked up first and only downloaded when the lookup fails,
    so repeated calls do not trigger a download attempt every time.
    """
    try:
        # nltk.data.find raises LookupError when the resource is not installed.
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
38
+
39
+ # Load GPT2 tokenizer
40
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
41
+
42
+ # Load Config file for API keys
43
+ config = load_comprehensive_config()
44
+ openai_api_key = config.get('API', 'openai_api_key', fallback=None)
45
+
46
def load_document(file_path):
    """Read a text file and return its content with whitespace normalised.

    Parameters:
        - file_path: Path of the file to read.

    Returns:
        - str: The file content with every run of whitespace (spaces, tabs,
          newlines) collapsed to a single space and the ends stripped.

    The file is decoded as UTF-8 explicitly, matching semantic_chunk_long_file,
    instead of relying on the platform's default encoding.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return re.sub(r'\s+', ' ', text).strip()
50
+
51
+
52
+ def improved_chunking_process(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
53
+ chunk_method = chunk_options.get('method', 'words')
54
+ max_chunk_size = chunk_options.get('max_size', 300)
55
+ overlap = chunk_options.get('overlap', 0)
56
+ language = chunk_options.get('language', 'english')
57
+ adaptive = chunk_options.get('adaptive', False)
58
+ multi_level = chunk_options.get('multi_level', False)
59
+
60
+ if adaptive:
61
+ max_chunk_size = adaptive_chunk_size(text, max_chunk_size)
62
+
63
+ if multi_level:
64
+ chunks = multi_level_chunking(text, chunk_method, max_chunk_size, overlap, language)
65
+ else:
66
+ if chunk_method == 'words':
67
+ chunks = chunk_text_by_words(text, max_chunk_size, overlap)
68
+ elif chunk_method == 'sentences':
69
+ chunks = chunk_text_by_sentences(text, max_chunk_size, overlap, language)
70
+ elif chunk_method == 'paragraphs':
71
+ chunks = chunk_text_by_paragraphs(text, max_chunk_size, overlap)
72
+ elif chunk_method == 'tokens':
73
+ chunks = chunk_text_by_tokens(text, max_chunk_size, overlap)
74
+ elif chunk_method == 'chapters':
75
+ return chunk_ebook_by_chapters(text, chunk_options)
76
+ else:
77
+ # No chunking applied
78
+ chunks = [text]
79
+
80
+ return [{'text': chunk, 'metadata': get_chunk_metadata(chunk, text)} for chunk in chunks]
81
+
82
+
83
def adaptive_chunk_size(text: str, base_size: int) -> int:
    """Shrink the chunk size for text made of long words.

    Parameters:
        - text (str): The text whose average word length is measured.
        - base_size (int): The chunk size requested by the caller.

    Returns:
        - int: ``int(base_size * 0.8)`` when the average word length is above
          6 characters, otherwise ``base_size`` unchanged.
    """
    words = text.split()
    if not words:
        # Empty or whitespace-only text has no average word length; keep the
        # requested size instead of dividing by zero.
        return base_size

    avg_word_length = sum(len(word) for word in words) / len(words)
    if avg_word_length > 6:  # Arbitrary threshold for "complex" text
        return int(base_size * 0.8)  # Reduce chunk size for complex text
    return base_size
89
+
90
+
91
+ def multi_level_chunking(text: str, method: str, max_size: int, overlap: int, language: str) -> List[str]:
92
+ # First level: chunk by paragraphs
93
+ paragraphs = chunk_text_by_paragraphs(text, max_size * 2, overlap)
94
+
95
+ # Second level: chunk each paragraph further
96
+ chunks = []
97
+ for para in paragraphs:
98
+ if method == 'words':
99
+ chunks.extend(chunk_text_by_words(para, max_size, overlap))
100
+ elif method == 'sentences':
101
+ chunks.extend(chunk_text_by_sentences(para, max_size, overlap, language))
102
+ else:
103
+ chunks.append(para)
104
+
105
+ return chunks
106
+
107
+
108
def chunk_text_by_words(text: str, max_words: int = 300, overlap: int = 0) -> List[str]:
    """Split text into chunks of at most ``max_words`` whitespace-separated words.

    Parameters:
        - text (str): The text to split.
        - max_words (int, optional): Maximum words per chunk. Must be >= 1.
        - overlap (int, optional): Number of words shared between consecutive
          chunks. Must satisfy ``0 <= overlap < max_words``.

    Returns:
        - List[str]: The chunks, each with its words joined by single spaces.

    Raises:
        - ValueError: If ``max_words`` or ``overlap`` is out of range.
    """
    if max_words < 1:
        raise ValueError(f"max_words must be at least 1, got {max_words}")
    if not 0 <= overlap < max_words:
        # A step of zero makes range() fail with an unhelpful message, and a
        # negative step silently yields no chunks at all.
        raise ValueError(
            f"overlap must be between 0 and max_words - 1 ({max_words - 1}), got {overlap}"
        )

    words = text.split()
    step = max_words - overlap
    chunks = [' '.join(words[i:i + max_words]) for i in range(0, len(words), step)]
    return post_process_chunks(chunks)
115
+
116
+
117
def chunk_text_by_sentences(text: str, max_sentences: int = 10, overlap: int = 0, language: str = 'english') -> List[
    str]:
    """Split text into chunks of at most ``max_sentences`` sentences.

    Sentences are detected with NLTK's ``sent_tokenize`` for ``language``.

    Parameters:
        - text (str): The text to split.
        - max_sentences (int, optional): Maximum sentences per chunk. Must be >= 1.
        - overlap (int, optional): Number of sentences shared between consecutive
          chunks. Must satisfy ``0 <= overlap < max_sentences``.
        - language (str, optional): Language name passed to the NLTK tokenizer.

    Returns:
        - List[str]: The chunks, each with its sentences joined by single spaces.

    Raises:
        - ValueError: If ``max_sentences`` or ``overlap`` is out of range.
    """
    if max_sentences < 1:
        raise ValueError(f"max_sentences must be at least 1, got {max_sentences}")
    if not 0 <= overlap < max_sentences:
        # A step of zero makes range() fail with an unhelpful message, and a
        # negative step silently yields no chunks at all.
        raise ValueError(
            f"overlap must be between 0 and max_sentences - 1 ({max_sentences - 1}), got {overlap}"
        )

    nltk.download('punkt', quiet=True)
    sentences = nltk.sent_tokenize(text, language=language)
    step = max_sentences - overlap
    chunks = [' '.join(sentences[i:i + max_sentences]) for i in range(0, len(sentences), step)]
    return post_process_chunks(chunks)
126
+
127
+
128
def chunk_text_by_paragraphs(text: str, max_paragraphs: int = 5, overlap: int = 0) -> List[str]:
    """Split text into chunks of at most ``max_paragraphs`` paragraphs.

    Paragraphs are separated by a blank line (a newline, optional whitespace,
    then another newline).

    Parameters:
        - text (str): The text to split.
        - max_paragraphs (int, optional): Maximum paragraphs per chunk. Must be >= 1.
        - overlap (int, optional): Number of paragraphs shared between consecutive
          chunks. Must satisfy ``0 <= overlap < max_paragraphs``.

    Returns:
        - List[str]: The chunks, each with its paragraphs joined by a blank line.

    Raises:
        - ValueError: If ``max_paragraphs`` or ``overlap`` is out of range.
    """
    if max_paragraphs < 1:
        raise ValueError(f"max_paragraphs must be at least 1, got {max_paragraphs}")
    if not 0 <= overlap < max_paragraphs:
        # A step of zero makes range() fail with an unhelpful message, and a
        # negative step silently yields no chunks at all.
        raise ValueError(
            f"overlap must be between 0 and max_paragraphs - 1 ({max_paragraphs - 1}), got {overlap}"
        )

    paragraphs = re.split(r'\n\s*\n', text)
    step = max_paragraphs - overlap
    chunks = ['\n\n'.join(paragraphs[i:i + max_paragraphs]) for i in range(0, len(paragraphs), step)]
    return post_process_chunks(chunks)
135
+
136
+
137
+ def chunk_text_by_tokens(text: str, max_tokens: int = 1000, overlap: int = 0) -> List[str]:
138
+ # This is a simplified token-based chunking. For more accurate tokenization,
139
+ # consider using a proper tokenizer like GPT-2 TokenizerFast
140
+ words = text.split()
141
+ chunks = []
142
+ current_chunk = []
143
+ current_token_count = 0
144
+
145
+ for word in words:
146
+ word_token_count = len(word) // 4 + 1 # Rough estimate of token count
147
+ if current_token_count + word_token_count > max_tokens and current_chunk:
148
+ chunks.append(' '.join(current_chunk))
149
+ current_chunk = current_chunk[-overlap:] if overlap > 0 else []
150
+ current_token_count = sum(len(w) // 4 + 1 for w in current_chunk)
151
+
152
+ current_chunk.append(word)
153
+ current_token_count += word_token_count
154
+
155
+ if current_chunk:
156
+ chunks.append(' '.join(current_chunk))
157
+
158
+ return post_process_chunks(chunks)
159
+
160
+
161
+ def post_process_chunks(chunks: List[str]) -> List[str]:
162
+ return [chunk.strip() for chunk in chunks if chunk.strip()]
163
+
164
+
165
def get_chunk_metadata(chunk: str, full_text: str, chunk_type: str = "generic", chapter_number: Optional[int] = None, chapter_pattern: Optional[str] = None) -> Dict[str, Any]:
    """Describe where a chunk sits inside the text it was cut from.

    Parameters:
        - chunk (str): The chunk text.
        - full_text (str): The text the chunk was produced from.
        - chunk_type (str, optional): Label stored under ``'chunk_type'``.
        - chapter_number (Optional[int], optional): Stored only when
          ``chunk_type == "chapter"``.
        - chapter_pattern (Optional[str], optional): Stored only when
          ``chunk_type == "chapter"``.

    Returns:
        - Dict[str, Any]: ``start_index``, ``end_index``, ``word_count``,
          ``char_count`` and ``chunk_type``, plus ``chapter_number`` and
          ``chapter_pattern`` for chapter chunks. ``start_index`` and
          ``end_index`` are ``None`` when the chunk cannot be located.

    Chunkers that re-join words or sentences with single spaces produce chunks
    that are not verbatim substrings of ``full_text``. When the exact lookup
    fails, the chunk is located again while treating any run of whitespace as
    interchangeable, instead of raising ``ValueError``. The first occurrence is
    always reported, so repeated or overlapping chunks map to the same span.
    """
    start_index: Optional[int] = None
    end_index: Optional[int] = None

    exact_position = full_text.find(chunk)
    if exact_position != -1:
        start_index = exact_position
        end_index = exact_position + len(chunk)
    else:
        words = chunk.split()
        if words:
            # Match the same words in order, allowing any whitespace between them.
            pattern = r'\s+'.join(re.escape(word) for word in words)
            match = re.search(pattern, full_text)
            if match is not None:
                start_index, end_index = match.start(), match.end()

    metadata = {
        'start_index': start_index,
        'end_index': end_index,
        'word_count': len(chunk.split()),
        'char_count': len(chunk),
        'chunk_type': chunk_type
    }
    if chunk_type == "chapter":
        metadata['chapter_number'] = chapter_number
        metadata['chapter_pattern'] = chapter_pattern
    return metadata
178
+
179
+
180
+ # Hybrid approach, chunk each sentence while ensuring total token size does not exceed a maximum number
181
def chunk_text_hybrid(text, max_tokens=1000):
    """Group whole sentences into chunks that stay within a token budget.

    Sentences come from NLTK's ``sent_tokenize`` and are measured with the
    module-level ``tokenizer``.

    Parameters:
        - text: The text to split.
        - max_tokens (optional): Token budget per chunk. Defaults to 1000.

    Returns:
        - list: The chunks, each with its sentences joined by single spaces.
          A single sentence longer than ``max_tokens`` is kept whole as its own
          chunk, since sentences are never split.
    """
    sentences = nltk.tokenize.sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        token_count = len(tokenizer.encode(sentence))
        if current_length + token_count <= max_tokens:
            current_chunk.append(sentence)
            current_length += token_count
        else:
            # Only flush when something has been collected; otherwise an
            # oversized sentence arriving on an empty chunk would emit ''.
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_length = token_count

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
201
+
202
+ # Thanks openai
203
+ def chunk_on_delimiter(input_string: str,
204
+ max_tokens: int,
205
+ delimiter: str) -> List[str]:
206
+ chunks = input_string.split(delimiter)
207
+ combined_chunks, _, dropped_chunk_count = combine_chunks_with_no_minimum(
208
+ chunks, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True)
209
+ if dropped_chunk_count > 0:
210
+ print(f"Warning: {dropped_chunk_count} chunks were dropped due to exceeding the token limit.")
211
+ combined_chunks = [f"{chunk}{delimiter}" for chunk in combined_chunks]
212
+ return combined_chunks
213
+
214
+ # ????FIXME
215
+ def recursive_summarize_chunks(chunks, summarize_func, custom_prompt, temp=None, system_prompt=None):
216
+ summarized_chunks = []
217
+ current_summary = ""
218
+
219
+ logging.debug(f"recursive_summarize_chunks: Summarizing {len(chunks)} chunks recursively...")
220
+ logging.debug(f"recursive_summarize_chunks: temperature is @ {temp}")
221
+ for i, chunk in enumerate(chunks):
222
+ if i == 0:
223
+ current_summary = summarize_func(chunk, custom_prompt, temp, system_prompt)
224
+ else:
225
+ combined_text = current_summary + "\n\n" + chunk
226
+ current_summary = summarize_func(combined_text, custom_prompt, temp, system_prompt)
227
+
228
+ summarized_chunks.append(current_summary)
229
+
230
+ return summarized_chunks
231
+
232
+
233
+ # Sample text for testing
234
+ sample_text = """
235
+ Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence
236
+ concerned with the interactions between computers and human language, in particular how to program computers
237
+ to process and analyze large amounts of natural language data. The result is a computer capable of "understanding"
238
+ the contents of documents, including the contextual nuances of the language within them. The technology can then
239
+ accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.
240
+
241
+ Challenges in natural language processing frequently involve speech recognition, natural language understanding,
242
+ and natural language generation.
243
+
244
+ Natural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled
245
+ "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence.
246
+ """
247
+
248
+ # Example usage of different chunking methods
249
+ # print("Chunking by words:")
250
+ # print(chunk_text_by_words(sample_text, max_words=50))
251
+ #
252
+ # print("\nChunking by sentences:")
253
+ # print(chunk_text_by_sentences(sample_text, max_sentences=2))
254
+ #
255
+ # print("\nChunking by paragraphs:")
256
+ # print(chunk_text_by_paragraphs(sample_text, max_paragraphs=1))
257
+ #
258
+ # print("\nChunking by tokens:")
259
+ # print(chunk_text_by_tokens(sample_text, max_tokens=50))
260
+ #
261
+ # print("\nHybrid chunking:")
262
+ # print(chunk_text_hybrid(sample_text, max_tokens=50))
263
+
264
+
265
+
266
+ #######################################################################################################################
267
+ #
268
+ # Experimental Semantic Chunking
269
+ #
270
+
271
+ # Chunk text into segments based on semantic similarity
272
+ def count_units(text, unit='tokens'):
273
+ if unit == 'words':
274
+ return len(text.split())
275
+ elif unit == 'tokens':
276
+ return len(word_tokenize(text))
277
+ elif unit == 'characters':
278
+ return len(text)
279
+ else:
280
+ raise ValueError("Invalid unit. Choose 'words', 'tokens', or 'characters'.")
281
+
282
+
283
def semantic_chunking(text, max_chunk_size=2000, unit='words',
                      overlap_sentences=3, similarity_threshold=0.5):
    """
    Split text into chunks of sentences, breaking on size limits and on drops in sentence similarity.

    Sentences are vectorized with TF-IDF. A chunk is closed either when adding the next sentence
    would exceed `max_chunk_size`, or when the cosine similarity between the current sentence and
    the next one falls below `similarity_threshold` while the chunk is already at least half full.
    Each new chunk starts with the last `overlap_sentences` sentences of the chunk just closed.

    Parameters:
    - text (str): The text to chunk.
    - max_chunk_size (int, optional): Size limit of a chunk, measured in `unit`. Defaults to 2000.
    - unit (str, optional): Unit understood by `count_units` ('words', 'tokens' or 'characters').
    - overlap_sentences (int, optional): Number of trailing sentences carried into the next chunk.
      Defaults to 3; 0 disables the overlap.
    - similarity_threshold (float, optional): Similarity below which a topic break is assumed.
      Defaults to 0.5.
    Returns:
    - List[str]: The chunks, each a space-joined run of sentences. Empty for empty/blank input.
    """
    # Nothing to chunk: return before touching NLTK/scikit-learn, because
    # TfidfVectorizer.fit_transform raises ValueError when it is given no documents.
    if not text or not text.strip():
        return []

    # Only invoke the downloader when the tokenizer data is actually missing,
    # instead of calling it on every invocation.
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt', quiet=True)

    sentences = sent_tokenize(text)
    if not sentences:
        return []

    try:
        sentence_vectors = TfidfVectorizer().fit_transform(sentences)
    except ValueError:
        # Empty vocabulary (e.g. the text holds only punctuation): no similarity
        # signal is available, so fall back to purely size-based chunking.
        sentence_vectors = None

    def _overlap_tail(chunk_sentences):
        # A plain [-0:] slice would return the whole list, so zero is handled explicitly.
        if overlap_sentences > 0:
            return chunk_sentences[-overlap_sentences:]
        return []

    chunks = []
    current_chunk = []
    current_size = 0
    last_index = len(sentences) - 1

    for i, sentence in enumerate(sentences):
        sentence_size = count_units(sentence, unit)

        # Size-based break: close the chunk before it would overflow.
        if current_chunk and current_size + sentence_size > max_chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = _overlap_tail(current_chunk)
            current_size = count_units(' '.join(current_chunk), unit)

        current_chunk.append(sentence)
        current_size += sentence_size

        # Similarity-based break: only considered when a next sentence exists.
        if sentence_vectors is not None and i < last_index:
            similarity = cosine_similarity(sentence_vectors[i], sentence_vectors[i + 1])[0][0]
            if similarity < similarity_threshold and current_size >= max_chunk_size // 2:
                chunks.append(' '.join(current_chunk))
                current_chunk = _overlap_tail(current_chunk)
                current_size = count_units(' '.join(current_chunk), unit)

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
318
+
319
+
320
def semantic_chunk_long_file(file_path, max_chunk_size=1000, overlap=100):
    """
    Read a UTF-8 text file and split its content with `semantic_chunking`.

    Parameters:
    - file_path (str): Path of the text file to read.
    - max_chunk_size (int, optional): Size limit forwarded to `semantic_chunking`. Defaults to 1000.
    - overlap (int, optional): Kept for backward compatibility; currently not applied, because
      `semantic_chunking` takes no size-based overlap argument in this position.
    Returns:
    - The list of chunks returned by `semantic_chunking`, or None if reading or chunking failed
      (the error is logged).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # `overlap` must not be forwarded positionally: the third positional parameter of
        # semantic_chunking is `unit`, and a number there makes count_units raise
        # ValueError("Invalid unit...") for every non-empty file.
        chunks = semantic_chunking(content, max_chunk_size)
        return chunks
    except Exception as e:
        logging.error(f"Error chunking text file: {str(e)}")
        return None
330
+ #######################################################################################################################
331
+
332
+
333
+
334
+
335
+
336
+
337
+ #######################################################################################################################
338
+ #
339
+ # OpenAI Rolling Summarization
340
+ #
341
+
342
client = OpenAI(api_key=openai_api_key)


def get_chat_completion(messages, model='gpt-4-turbo'):
    """
    Send chat messages through the module-level OpenAI client and return the reply text.

    Parameters:
    - messages: The message list passed to `client.chat.completions.create`.
    - model (str, optional): Model name to request. Defaults to 'gpt-4-turbo'.
    Returns:
    - The `message.content` of the first choice in the response.
    """
    request_args = {
        'model': model,
        'messages': messages,
        'temperature': 0,  # temperature is fixed at 0 for every request
    }
    completion = client.chat.completions.create(**request_args)
    first_choice = completion.choices[0]
    return first_choice.message.content
350
+
351
+
352
+ # This function combines text chunks into larger blocks without exceeding a specified token count.
353
+ # It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow.
354
def combine_chunks_with_no_minimum(
        chunks: List[str],
        max_tokens: int,
        chunk_delimiter="\n\n",
        header: Optional[str] = None,
        add_ellipsis_for_overflow=False,
        tokenizer=None,
) -> Tuple[List[str], List[List[int]], int]:
    """
    Greedily merge consecutive chunks into larger blocks that stay within a token budget.

    Parameters:
    - chunks (List[str]): The pieces of text to combine, in order.
    - max_tokens (int): Maximum token count allowed for one combined block.
    - chunk_delimiter (str, optional): String placed between pieces when joining. Defaults to a blank line.
    - header (Optional[str], optional): If given, prepended to every combined block and counted
      against the token budget.
    - add_ellipsis_for_overflow (bool, optional): If True, a "..." placeholder is added to the
      current block in place of a dropped chunk, when it still fits.
    - tokenizer (callable, optional): Function mapping a string to a sized sequence of tokens.
      Defaults to `openai_tokenize`.
    Returns:
    - Tuple of (combined blocks, list of original chunk indices per block, number of dropped chunks).
      A chunk is dropped when it alone (plus header) exceeds `max_tokens`.
    """
    # FIXME MAKE NOT OPENAI SPECIFIC - the default tokenizer still is; pass `tokenizer` to override.
    tokenize = openai_tokenize if tokenizer is None else tokenizer

    def token_count(parts):
        # Token length of the pieces once joined with the delimiter.
        return len(tokenize(chunk_delimiter.join(parts)))

    dropped_chunk_count = 0
    output = []  # list to hold the final combined chunks
    output_indices = []  # list to hold the indices of the final combined chunks
    candidate = [] if header is None else [header]  # current combined chunk candidate
    candidate_indices = []
    for chunk_i, chunk in enumerate(chunks):
        chunk_with_header = [chunk] if header is None else [header, chunk]
        # A chunk that cannot fit even on its own is dropped rather than emitted oversized.
        if token_count(chunk_with_header) > max_tokens:
            print("warning: chunk overflow")
            if add_ellipsis_for_overflow and token_count(candidate + ["..."]) <= max_tokens:
                candidate.append("...")
            # Every skipped chunk is counted, whether or not a placeholder was added.
            dropped_chunk_count += 1
            continue  # this case would break downstream assumptions
        # If adding the chunk would exceed max_tokens, emit the candidate and start a new one.
        if token_count(candidate + [chunk]) > max_tokens:
            output.append(chunk_delimiter.join(candidate))
            output_indices.append(candidate_indices)
            candidate = chunk_with_header  # re-initialize candidate
            candidate_indices = [chunk_i]
        # otherwise keep extending the candidate
        else:
            candidate.append(chunk)
            candidate_indices.append(chunk_i)
    # add the remaining candidate to output if it holds more than just the header
    if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0):
        output.append(chunk_delimiter.join(candidate))
        output_indices.append(candidate_indices)
    return output, output_indices, dropped_chunk_count
399
+
400
+
401
def rolling_summarize(text: str,
                      detail: float = 0,
                      model: str = 'gpt-4-turbo',
                      additional_instructions: Optional[str] = None,
                      minimum_chunk_size: Optional[int] = 500,
                      chunk_delimiter: str = ".",
                      summarize_recursively=False,
                      verbose=False):
    """
    Summarizes a given text by splitting it into chunks, each of which is summarized individually.
    The level of detail in the summary can be adjusted, and the process can optionally be made recursive.

    Parameters:
    - text (str): The text to be summarized.
    - detail (float, optional): A value between 0 and 1 indicating the desired level of detail in the summary.
      0 leads to a higher level summary, and 1 results in a more detailed summary. Defaults to 0.
    - model (str, optional): The model name passed to `get_chat_completion`. Defaults to 'gpt-4-turbo'.
    - additional_instructions (Optional[str], optional): Additional instructions appended to the system message
      for customizing summaries.
    - minimum_chunk_size (Optional[int], optional): The minimum size for text chunks. Defaults to 500.
    - chunk_delimiter (str, optional): The delimiter used to split the text into chunks. Defaults to ".".
    - summarize_recursively (bool, optional): If True, each chunk is summarized together with the previous summary.
    - verbose (bool, optional): If True, prints detailed information about the chunking process.
    Returns:
    - str: The summaries of all chunks, joined by blank lines.

    The function first determines the number of chunks by interpolating between a minimum and a maximum chunk count
    based on the `detail` parameter. It then splits the text into chunks and summarizes each chunk. If
    `summarize_recursively` is True, the previous summary is sent along with each new chunk to add context.
    """

    # check detail is set correctly
    assert 0 <= detail <= 1

    # interpolate the number of chunks based to get specified level of detail
    max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter))
    min_chunks = 1
    # Clamp to at least one chunk: if the initial chunking yields no chunks (max_chunks == 0)
    # and detail is 1, the interpolation gives 0 and the division below would raise
    # ZeroDivisionError.
    num_chunks = max(min_chunks, int(min_chunks + detail * (max_chunks - min_chunks)))

    # adjust chunk_size based on interpolated number of chunks
    # FIXME MAKE NOT OPENAI SPECIFIC
    document_length = len(openai_tokenize(text))
    chunk_size = max(minimum_chunk_size, document_length // num_chunks)
    text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
    if verbose:
        print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.")
        # FIXME MAKE NOT OPENAI SPECIFIC
        print(f"Chunk lengths are {[len(openai_tokenize(x)) for x in text_chunks]}")

    # set system message - FIXME
    system_message_content = "Rewrite this text in summarized form."
    if additional_instructions is not None:
        system_message_content += f"\n\n{additional_instructions}"

    accumulated_summaries = []
    for chunk in tqdm(text_chunks):
        if summarize_recursively and accumulated_summaries:
            # Combine previous summary with current chunk for recursive summarization
            combined_text = accumulated_summaries[-1] + "\n\n" + chunk
            user_message_content = f"Previous summary and new content to summarize:\n\n{combined_text}"
        else:
            user_message_content = chunk

        messages = [
            {"role": "system", "content": system_message_content},
            {"role": "user", "content": user_message_content}
        ]

        response = get_chat_completion(messages, model=model)
        accumulated_summaries.append(response)

    final_summary = '\n\n'.join(accumulated_summaries)
    return final_summary
475
+
476
+ #
477
+ #
478
+ #######################################################################################################################
479
+ #
480
+ # Ebook Chapter Chunking
481
+
482
+
483
def chunk_ebook_by_chapters(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Split ebook text into one chunk per chapter, detecting chapter headings by regex.

    Heading patterns are tried in order (custom pattern first); the first pattern that matches
    anywhere in the text is used for the whole document. If none matches, the entire text is
    returned as a single "whole_document" chunk.

    Parameters:
    - text (str): The full ebook text.
    - chunk_options (Dict[str, Any]): Options read here:
        'overlap' (int, default 0): number of characters preceding a chapter start that are
            prepended to that chapter (not applied to the first chapter);
        'custom_chapter_pattern' (str, optional): regex tried before the built-in patterns,
            compiled with MULTILINE and IGNORECASE.
      A 'max_size' option is not applied by this function.
    Returns:
    - List[Dict[str, Any]]: Dicts with 'text' and 'metadata' (from `get_chunk_metadata`).

    NOTE: text located before the first detected heading is not included in any chunk.
    """
    overlap = chunk_options.get('overlap', 0)
    custom_pattern = chunk_options.get('custom_chapter_pattern', None)

    case_insensitive = re.MULTILINE | re.IGNORECASE

    # List of (pattern, flags) chapter heading patterns to try, in order.
    chapter_patterns = [
        (custom_pattern, case_insensitive),
        (r'^#{1,2}\s+', case_insensitive),       # Markdown style: '# ' or '## '
        (r'^Chapter\s+\d+', case_insensitive),   # 'Chapter ' followed by numbers
        (r'^\d+\.\s+', case_insensitive),        # Numbered chapters: '1. ', '2. ', etc.
        # All caps headings. Deliberately case-sensitive and requiring at least one capital:
        # with IGNORECASE and a \s-only class, any letters-only line and even runs of blank
        # lines would be treated as chapter headings.
        (r'^[ \t]*[A-Z][A-Z \t]*\r?$', re.MULTILINE),
    ]

    chapter_positions = []
    used_pattern = None

    for pattern, flags in chapter_patterns:
        # Skip a missing custom pattern; an empty pattern is skipped too because it
        # would match at every position of the text.
        if not pattern:
            continue
        chapter_regex = re.compile(pattern, flags)
        chapter_positions = [match.start() for match in chapter_regex.finditer(text)]
        if chapter_positions:
            used_pattern = pattern
            break

    # If no chapters found, return the entire content as one chunk
    if not chapter_positions:
        return [{'text': text, 'metadata': get_chunk_metadata(text, text, chunk_type="whole_document")}]

    # Split content into chapters: each chapter runs up to the next heading (or end of text).
    chapter_ends = chapter_positions[1:] + [None]
    chunks = []
    for i, (start, end) in enumerate(zip(chapter_positions, chapter_ends)):
        # Apply overlap if specified (never before the first chapter)
        if overlap > 0 and i > 0:
            start = max(0, start - overlap)
        chunks.append(text[start:end])

    # Post-process chunks
    processed_chunks = post_process_chunks(chunks)

    # Add metadata to chunks
    return [{'text': chunk, 'metadata': get_chunk_metadata(chunk, text, chunk_type="chapter", chapter_number=i + 1,
                                                           chapter_pattern=used_pattern)}
            for i, chunk in enumerate(processed_chunks)]
534
+
535
+
536
+ # # Example usage
537
+ # if __name__ == "__main__":
538
+ # sample_ebook_content = """
539
+ # # Chapter 1: Introduction
540
+ #
541
+ # This is the introduction.
542
+ #
543
+ # ## Section 1.1
544
+ #
545
+ # Some content here.
546
+ #
547
+ # # Chapter 2: Main Content
548
+ #
549
+ # This is the main content.
550
+ #
551
+ # ## Section 2.1
552
+ #
553
+ # More content here.
554
+ #
555
+ # CHAPTER THREE
556
+ #
557
+ # This is the third chapter.
558
+ #
559
+ # 4. Fourth Chapter
560
+ #
561
+ # This is the fourth chapter.
562
+ # """
563
+ #
564
+ # chunk_options = {
565
+ # 'method': 'chapters',
566
+ # 'max_size': 500,
567
+ # 'overlap': 50,
568
+ # 'custom_chapter_pattern': r'^CHAPTER\s+[A-Z]+' # Custom pattern for 'CHAPTER THREE' style
569
+ # }
570
+ #
571
+ # chunked_chapters = improved_chunking_process(sample_ebook_content, chunk_options)
572
+ #
573
+ # for i, chunk in enumerate(chunked_chapters, 1):
574
+ # print(f"Chunk {i}:")
575
+ # print(chunk['text'])
576
+ # print(f"Metadata: {chunk['metadata']}\n")
577
+
578
+
579
+
580
+
581
+ #
582
+ # End of Chunking Library
583
+ #######################################################################################################################