oceansweep committed on
Commit
9c56866
1 Parent(s): 21f6a30

Upload Chunk_Lib.py

Files changed (1):
  App_Function_Libraries/Chunk_Lib.py +848 -598
App_Function_Libraries/Chunk_Lib.py CHANGED
@@ -1,599 +1,849 @@
- # Chunk_Lib.py
- #########################################
- # Chunking Library
- # This library is used to perform chunking of input files.
- # Currently uses naive approaches. Nothing fancy.
- #
- ####
- # Import necessary libraries
- import logging
- import re
-
- from typing import List, Optional, Tuple, Dict, Any
-
- from openai import OpenAI
- from tqdm import tqdm
- #
- # Import 3rd party
- from transformers import GPT2Tokenizer
- import nltk
- from nltk.tokenize import sent_tokenize, word_tokenize
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.metrics.pairwise import cosine_similarity
- #
- # Import Local
- from App_Function_Libraries.Tokenization_Methods_Lib import openai_tokenize
- from App_Function_Libraries.Utils.Utils import load_comprehensive_config
-
-
- #
- #######################################################################################################################
- # Function Definitions
- #
-
- # FIXME - Make sure it only downloads if it doesn't already exist, and does a check first.
- # Ensure NLTK data is downloaded
- def ntlk_prep():
-     nltk.download('punkt')
-
- # Load Config file for API keys
- tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-
- # Load Config file for API keys
- config = load_comprehensive_config()
- openai_api_key = config.get('API', 'openai_api_key', fallback=None)
-
- def load_document(file_path):
-     with open(file_path, 'r') as file:
-         text = file.read()
-     return re.sub('\\s+', ' ', text).strip()
-
- # Load configuration
- config = load_comprehensive_config()
- # Embedding Chunking options
- chunk_options = {
-     'method': config.get('Chunking', 'method', fallback='words'),
-     'max_size': config.getint('Chunking', 'max_size', fallback=400),
-     'overlap': config.getint('Chunking', 'overlap', fallback=200),
-     'adaptive': config.getboolean('Chunking', 'adaptive', fallback=False),
-     'multi_level': config.getboolean('Chunking', 'multi_level', fallback=False),
-     'language': config.get('Chunking', 'language', fallback='english')
- }
-
-
- def improved_chunking_process(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
-     chunk_method = chunk_options.get('method', 'words')
-     max_chunk_size = chunk_options.get('max_size', 300)
-     overlap = chunk_options.get('overlap', 0)
-     language = chunk_options.get('language', 'english')
-     adaptive = chunk_options.get('adaptive', False)
-     multi_level = chunk_options.get('multi_level', False)
-
-     if adaptive:
-         max_chunk_size = adaptive_chunk_size(text, max_chunk_size)
-
-     if multi_level:
-         chunks = multi_level_chunking(text, chunk_method, max_chunk_size, overlap, language)
-     else:
-         if chunk_method == 'words':
-             chunks = chunk_text_by_words(text, max_chunk_size, overlap)
-         elif chunk_method == 'sentences':
-             chunks = chunk_text_by_sentences(text, max_chunk_size, overlap, language)
-         elif chunk_method == 'paragraphs':
-             chunks = chunk_text_by_paragraphs(text, max_chunk_size, overlap)
-         elif chunk_method == 'tokens':
-             chunks = chunk_text_by_tokens(text, max_chunk_size, overlap)
-         elif chunk_method == 'chapters':
-             return chunk_ebook_by_chapters(text, chunk_options)
-         else:
-             # No chunking applied
-             chunks = [text]
-
-     return [{'text': chunk, 'metadata': get_chunk_metadata(chunk, text)} for chunk in chunks]
-
-
- def adaptive_chunk_size(text: str, base_size: int) -> int:
-     # Simple adaptive logic: adjust chunk size based on text complexity
-     avg_word_length = sum(len(word) for word in text.split()) / len(text.split())
-     if avg_word_length > 6:  # Arbitrary threshold for "complex" text
-         return int(base_size * 0.8)  # Reduce chunk size for complex text
-     return base_size
-
-
- def multi_level_chunking(text: str, method: str, max_size: int, overlap: int, language: str) -> List[str]:
-     # First level: chunk by paragraphs
-     paragraphs = chunk_text_by_paragraphs(text, max_size * 2, overlap)
-
-     # Second level: chunk each paragraph further
-     chunks = []
-     for para in paragraphs:
-         if method == 'words':
-             chunks.extend(chunk_text_by_words(para, max_size, overlap))
-         elif method == 'sentences':
-             chunks.extend(chunk_text_by_sentences(para, max_size, overlap, language))
-         else:
-             chunks.append(para)
-
-     return chunks
-
-
- def chunk_text_by_words(text: str, max_words: int = 300, overlap: int = 0) -> List[str]:
-     words = text.split()
-     chunks = []
-     for i in range(0, len(words), max_words - overlap):
-         chunk = ' '.join(words[i:i + max_words])
-         chunks.append(chunk)
-     return post_process_chunks(chunks)
-
-
- def chunk_text_by_sentences(text: str, max_sentences: int = 10, overlap: int = 0, language: str = 'english') -> List[str]:
-     nltk.download('punkt', quiet=True)
-     sentences = nltk.sent_tokenize(text, language=language)
-     chunks = []
-     for i in range(0, len(sentences), max_sentences - overlap):
-         chunk = ' '.join(sentences[i:i + max_sentences])
-         chunks.append(chunk)
-     return post_process_chunks(chunks)
-
-
- def chunk_text_by_paragraphs(text: str, max_paragraphs: int = 5, overlap: int = 0) -> List[str]:
-     paragraphs = re.split(r'\n\s*\n', text)
-     chunks = []
-     for i in range(0, len(paragraphs), max_paragraphs - overlap):
-         chunk = '\n\n'.join(paragraphs[i:i + max_paragraphs])
-         chunks.append(chunk)
-     return post_process_chunks(chunks)
-
-
- def chunk_text_by_tokens(text: str, max_tokens: int = 1000, overlap: int = 0) -> List[str]:
-     # This is a simplified token-based chunking. For more accurate tokenization,
-     # consider using a proper tokenizer like GPT-2 TokenizerFast
-     words = text.split()
-     chunks = []
-     current_chunk = []
-     current_token_count = 0
-
-     for word in words:
-         word_token_count = len(word) // 4 + 1  # Rough estimate of token count
-         if current_token_count + word_token_count > max_tokens and current_chunk:
-             chunks.append(' '.join(current_chunk))
-             current_chunk = current_chunk[-overlap:] if overlap > 0 else []
-             current_token_count = sum(len(w) // 4 + 1 for w in current_chunk)
-
-         current_chunk.append(word)
-         current_token_count += word_token_count
-
-     if current_chunk:
-         chunks.append(' '.join(current_chunk))
-
-     return post_process_chunks(chunks)
-
-
- def post_process_chunks(chunks: List[str]) -> List[str]:
-     return [chunk.strip() for chunk in chunks if chunk.strip()]
-
-
- def get_chunk_metadata(chunk: str, full_text: str, chunk_type: str = "generic", chapter_number: Optional[int] = None, chapter_pattern: Optional[str] = None) -> Dict[str, Any]:
-     try:
-         start_index = full_text.index(chunk)
-         metadata = {
-             'start_index': start_index,
-             'end_index': start_index + len(chunk),
-             'word_count': len(chunk.split()),
-             'char_count': len(chunk),
-             'chunk_type': chunk_type
-         }
-         if chunk_type == "chapter":
-             metadata['chapter_number'] = chapter_number
-             metadata['chapter_pattern'] = chapter_pattern
-         return metadata
-     except ValueError as e:
-         logging.error(f"Chunk not found in full_text: {chunk[:50]}... Full text length: {len(full_text)}")
-         raise
-
-
- # Hybrid approach: chunk by sentence while ensuring the total token size does not exceed a maximum
- def chunk_text_hybrid(text, max_tokens=1000):
-     sentences = nltk.tokenize.sent_tokenize(text)
-     chunks = []
-     current_chunk = []
-     current_length = 0
-
-     for sentence in sentences:
-         tokens = tokenizer.encode(sentence)
-         if current_length + len(tokens) <= max_tokens:
-             current_chunk.append(sentence)
-             current_length += len(tokens)
-         else:
-             chunks.append(' '.join(current_chunk))
-             current_chunk = [sentence]
-             current_length = len(tokens)
-
-     if current_chunk:
-         chunks.append(' '.join(current_chunk))
-
-     return chunks
-
- # Thanks openai
- def chunk_on_delimiter(input_string: str,
-                        max_tokens: int,
-                        delimiter: str) -> List[str]:
-     chunks = input_string.split(delimiter)
-     combined_chunks, _, dropped_chunk_count = combine_chunks_with_no_minimum(
-         chunks, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True)
-     if dropped_chunk_count > 0:
-         print(f"Warning: {dropped_chunk_count} chunks were dropped due to exceeding the token limit.")
-     combined_chunks = [f"{chunk}{delimiter}" for chunk in combined_chunks]
-     return combined_chunks
-
- # ????FIXME
- def recursive_summarize_chunks(chunks, summarize_func, custom_prompt, temp=None, system_prompt=None):
-     summarized_chunks = []
-     current_summary = ""
-
-     logging.debug(f"recursive_summarize_chunks: Summarizing {len(chunks)} chunks recursively...")
-     logging.debug(f"recursive_summarize_chunks: temperature is @ {temp}")
-     for i, chunk in enumerate(chunks):
-         if i == 0:
-             current_summary = summarize_func(chunk, custom_prompt, temp, system_prompt)
-         else:
-             combined_text = current_summary + "\n\n" + chunk
-             current_summary = summarize_func(combined_text, custom_prompt, temp, system_prompt)
-
-         summarized_chunks.append(current_summary)
-
-     return summarized_chunks
-
-
- # Sample text for testing
- sample_text = """
- Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence
- concerned with the interactions between computers and human language, in particular how to program computers
- to process and analyze large amounts of natural language data. The result is a computer capable of "understanding"
- the contents of documents, including the contextual nuances of the language within them. The technology can then
- accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.
-
- Challenges in natural language processing frequently involve speech recognition, natural language understanding,
- and natural language generation.
-
- Natural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled
- "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence.
- """
-
- # Example usage of different chunking methods
- # print("Chunking by words:")
- # print(chunk_text_by_words(sample_text, max_words=50))
- #
- # print("\nChunking by sentences:")
- # print(chunk_text_by_sentences(sample_text, max_sentences=2))
- #
- # print("\nChunking by paragraphs:")
- # print(chunk_text_by_paragraphs(sample_text, max_paragraphs=1))
- #
- # print("\nChunking by tokens:")
- # print(chunk_text_by_tokens(sample_text, max_tokens=50))
- #
- # print("\nHybrid chunking:")
- # print(chunk_text_hybrid(sample_text, max_tokens=50))
-
-
-
- #######################################################################################################################
- #
- # Experimental Semantic Chunking
- #
-
- # Chunk text into segments based on semantic similarity
- def count_units(text, unit='tokens'):
-     if unit == 'words':
-         return len(text.split())
-     elif unit == 'tokens':
-         return len(word_tokenize(text))
-     elif unit == 'characters':
-         return len(text)
-     else:
-         raise ValueError("Invalid unit. Choose 'words', 'tokens', or 'characters'.")
-
-
- def semantic_chunking(text, max_chunk_size=2000, unit='words'):
-     nltk.download('punkt', quiet=True)
-     sentences = sent_tokenize(text)
-     vectorizer = TfidfVectorizer()
-     sentence_vectors = vectorizer.fit_transform(sentences)
-
-     chunks = []
-     current_chunk = []
-     current_size = 0
-
-     for i, sentence in enumerate(sentences):
-         sentence_size = count_units(sentence, unit)
-         if current_size + sentence_size > max_chunk_size and current_chunk:
-             chunks.append(' '.join(current_chunk))
-             overlap_size = count_units(' '.join(current_chunk[-3:]), unit)  # Use last 3 sentences for overlap
-             current_chunk = current_chunk[-3:]  # Keep last 3 sentences for overlap
-             current_size = overlap_size
-
-         current_chunk.append(sentence)
-         current_size += sentence_size
-
-         if i + 1 < len(sentences):
-             current_vector = sentence_vectors[i]
-             next_vector = sentence_vectors[i + 1]
-             similarity = cosine_similarity(current_vector, next_vector)[0][0]
-             if similarity < 0.5 and current_size >= max_chunk_size // 2:
-                 chunks.append(' '.join(current_chunk))
-                 overlap_size = count_units(' '.join(current_chunk[-3:]), unit)
-                 current_chunk = current_chunk[-3:]
-                 current_size = overlap_size
-
-     if current_chunk:
-         chunks.append(' '.join(current_chunk))
-
-     return chunks
-
-
- def semantic_chunk_long_file(file_path, max_chunk_size=1000, overlap=100):
-     try:
-         with open(file_path, 'r', encoding='utf-8') as file:
-             content = file.read()
-
-         chunks = semantic_chunking(content, max_chunk_size, overlap)
-         return chunks
-     except Exception as e:
-         logging.error(f"Error chunking text file: {str(e)}")
-         return None
- #######################################################################################################################
-
-
-
-
-
-
- #######################################################################################################################
- #
- # OpenAI Rolling Summarization
- #
-
- client = OpenAI(api_key=openai_api_key)
- def get_chat_completion(messages, model='gpt-4-turbo'):
-     response = client.chat.completions.create(
-         model=model,
-         messages=messages,
-         temperature=0,
-     )
-     return response.choices[0].message.content
-
-
- # This function combines text chunks into larger blocks without exceeding a specified token count.
- # It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow.
- def combine_chunks_with_no_minimum(
-         chunks: List[str],
-         max_tokens: int,
-         chunk_delimiter="\n\n",
-         header: Optional[str] = None,
-         add_ellipsis_for_overflow=False,
- ) -> Tuple[List[str], List[int]]:
-     dropped_chunk_count = 0
-     output = []  # list to hold the final combined chunks
-     output_indices = []  # list to hold the indices of the final combined chunks
-     candidate = (
-         [] if header is None else [header]
-     )  # list to hold the current combined chunk candidate
-     candidate_indices = []
-     for chunk_i, chunk in enumerate(chunks):
-         chunk_with_header = [chunk] if header is None else [header, chunk]
-         # FIXME MAKE NOT OPENAI SPECIFIC
-         if len(openai_tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens:
-             print(f"warning: chunk overflow")
-             if (
-                     add_ellipsis_for_overflow
-                     # FIXME MAKE NOT OPENAI SPECIFIC
-                     and len(openai_tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens
-             ):
-                 candidate.append("...")
-                 dropped_chunk_count += 1
-             continue  # this case would break downstream assumptions
-         # estimate token count with the current chunk added
-         # FIXME MAKE NOT OPENAI SPECIFIC
-         extended_candidate_token_count = len(openai_tokenize(chunk_delimiter.join(candidate + [chunk])))
-         # If the token count exceeds max_tokens, add the current candidate to output and start a new candidate
-         if extended_candidate_token_count > max_tokens:
-             output.append(chunk_delimiter.join(candidate))
-             output_indices.append(candidate_indices)
-             candidate = chunk_with_header  # re-initialize candidate
-             candidate_indices = [chunk_i]
-         # otherwise keep extending the candidate
-         else:
-             candidate.append(chunk)
-             candidate_indices.append(chunk_i)
-     # add the remaining candidate to output if it's not empty
-     if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0):
-         output.append(chunk_delimiter.join(candidate))
-         output_indices.append(candidate_indices)
-     return output, output_indices, dropped_chunk_count
-
-
- def rolling_summarize(text: str,
-                       detail: float = 0,
-                       model: str = 'gpt-4-turbo',
-                       additional_instructions: Optional[str] = None,
-                       minimum_chunk_size: Optional[int] = 500,
-                       chunk_delimiter: str = ".",
-                       summarize_recursively=False,
-                       verbose=False):
-     """
-     Summarizes a given text by splitting it into chunks, each of which is summarized individually.
-     The level of detail in the summary can be adjusted, and the process can optionally be made recursive.
-
-     Parameters:
-     - text (str): The text to be summarized.
-     - detail (float, optional): A value between 0 and 1 indicating the desired level of detail in the summary.
-       0 leads to a higher-level summary, and 1 results in a more detailed summary. Defaults to 0.
-     - additional_instructions (Optional[str], optional): Additional instructions to provide to the model for
-       customizing summaries.
-     - minimum_chunk_size (Optional[int], optional): The minimum size for text chunks. Defaults to 500.
-     - chunk_delimiter (str, optional): The delimiter used to split the text into chunks. Defaults to ".".
-     - summarize_recursively (bool, optional): If True, summaries are generated recursively, using previous summaries for context.
-     - verbose (bool, optional): If True, prints detailed information about the chunking process.
-
-     Returns:
-     - str: The final compiled summary of the text.
-
-     The function first determines the number of chunks by interpolating between a minimum and a maximum chunk count
-     based on the `detail` parameter. It then splits the text into chunks and summarizes each chunk. If
-     `summarize_recursively` is True, each summary is based on the previous summaries, adding more context to the
-     summarization process. The function returns a compiled summary of all chunks.
-     """
-
-     # check detail is set correctly
-     assert 0 <= detail <= 1
-
-     # interpolate the number of chunks to get the specified level of detail
-     max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter))
-     min_chunks = 1
-     num_chunks = int(min_chunks + detail * (max_chunks - min_chunks))
-
-     # adjust chunk_size based on interpolated number of chunks
-     # FIXME MAKE NOT OPENAI SPECIFIC
-     document_length = len(openai_tokenize(text))
-     chunk_size = max(minimum_chunk_size, document_length // num_chunks)
-     text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
-     if verbose:
-         print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.")
-         # FIXME MAKE NOT OPENAI SPECIFIC
-         print(f"Chunk lengths are {[len(openai_tokenize(x)) for x in text_chunks]}")
-
-     # set system message - FIXME
-     system_message_content = "Rewrite this text in summarized form."
-     if additional_instructions is not None:
-         system_message_content += f"\n\n{additional_instructions}"
-
-     accumulated_summaries = []
-     for i, chunk in enumerate(tqdm(text_chunks)):
-         if summarize_recursively and accumulated_summaries:
-             # Combine previous summary with current chunk for recursive summarization
-             combined_text = accumulated_summaries[-1] + "\n\n" + chunk
-             user_message_content = f"Previous summary and new content to summarize:\n\n{combined_text}"
-         else:
-             user_message_content = chunk
-
-         messages = [
-             {"role": "system", "content": system_message_content},
-             {"role": "user", "content": user_message_content}
-         ]
-
-         response = get_chat_completion(messages, model=model)
-         accumulated_summaries.append(response)
-
-     final_summary = '\n\n'.join(accumulated_summaries)
-     return final_summary
-
- #
- #
- #######################################################################################################################
- #
- # Ebook Chapter Chunking
-
-
- def chunk_ebook_by_chapters(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
-     max_chunk_size = chunk_options.get('max_size', 300)
-     overlap = chunk_options.get('overlap', 0)
-     custom_pattern = chunk_options.get('custom_chapter_pattern', None)
-
-     # List of chapter heading patterns to try, in order
-     chapter_patterns = [
-         custom_pattern,
-         r'^#{1,2}\s+',      # Markdown style: '# ' or '## '
-         r'^Chapter\s+\d+',  # 'Chapter ' followed by numbers
-         r'^\d+\.\s+',       # Numbered chapters: '1. ', '2. ', etc.
-         r'^[A-Z\s]+$'       # All caps headings
-     ]
-
-     chapter_positions = []
-     used_pattern = None
-
-     for pattern in chapter_patterns:
-         if pattern is None:
-             continue
-         chapter_regex = re.compile(pattern, re.MULTILINE | re.IGNORECASE)
-         chapter_positions = [match.start() for match in chapter_regex.finditer(text)]
-         if chapter_positions:
-             used_pattern = pattern
-             break
-
-     # If no chapters found, return the entire content as one chunk
-     if not chapter_positions:
-         return [{'text': text, 'metadata': get_chunk_metadata(text, text, chunk_type="whole_document")}]
-
-     # Split content into chapters
-     chunks = []
-     for i in range(len(chapter_positions)):
-         start = chapter_positions[i]
-         end = chapter_positions[i + 1] if i + 1 < len(chapter_positions) else None
-         chapter = text[start:end]
-
-         # Apply overlap if specified
-         if overlap > 0 and i > 0:
-             overlap_start = max(0, start - overlap)
-             chapter = text[overlap_start:end]
-
-         chunks.append(chapter)
-
-     # Post-process chunks
-     processed_chunks = post_process_chunks(chunks)
-
-     # Add metadata to chunks
-     return [{'text': chunk, 'metadata': get_chunk_metadata(chunk, text, chunk_type="chapter", chapter_number=i + 1,
-                                                            chapter_pattern=used_pattern)}
-             for i, chunk in enumerate(processed_chunks)]
-
-
- # # Example usage
- # if __name__ == "__main__":
- #     sample_ebook_content = """
- #     # Chapter 1: Introduction
- #
- #     This is the introduction.
- #
- #     ## Section 1.1
- #
- #     Some content here.
- #
- #     # Chapter 2: Main Content
- #
- #     This is the main content.
- #
- #     ## Section 2.1
- #
- #     More content here.
- #
- #     CHAPTER THREE
- #
- #     This is the third chapter.
- #
- #     4. Fourth Chapter
- #
- #     This is the fourth chapter.
- #     """
- #
- #     chunk_options = {
- #         'method': 'chapters',
- #         'max_size': 500,
- #         'overlap': 50,
- #         'custom_chapter_pattern': r'^CHAPTER\s+[A-Z]+'  # Custom pattern for 'CHAPTER THREE' style
- #     }
- #
- #     chunked_chapters = improved_chunking_process(sample_ebook_content, chunk_options)
- #
- #     for i, chunk in enumerate(chunked_chapters, 1):
- #         print(f"Chunk {i}:")
- #         print(chunk['text'])
- #         print(f"Metadata: {chunk['metadata']}\n")
-
-
-
-
- #
- # End of Chunking Library
+ # Chunk_Lib.py
+ #########################################
+ # Chunking Library
+ # This library is used to perform chunking of input files.
+ # Currently uses naive approaches. Nothing fancy.
+ #
+ ####
+ # Import necessary libraries
+ import hashlib
+ import logging
+ import re
+ from typing import Any, Dict, List, Optional, Tuple
+ #
+ # Import 3rd party
+ from openai import OpenAI
+ from tqdm import tqdm
+ from langdetect import detect
+ from transformers import GPT2Tokenizer
+ import nltk
+ from nltk.tokenize import sent_tokenize, word_tokenize
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+ #
+ # Import Local
+ from App_Function_Libraries.Tokenization_Methods_Lib import openai_tokenize
+ from App_Function_Libraries.Utils.Utils import load_comprehensive_config
+ #
+ #######################################################################################################################
+ # Config Settings
+ #
+ #
+ # FIXME - Make sure it only downloads if it doesn't already exist, and does a check first.
+ # Ensure NLTK data is downloaded
+ def ntlk_prep():
+     nltk.download('punkt')
+ #
+ # Load GPT2 tokenizer
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+ #
+ # Load configuration
+ config = load_comprehensive_config()
+ # Embedding Chunking options
+ chunk_options = {
+     'method': config.get('Chunking', 'method', fallback='words'),
+     'max_size': config.getint('Chunking', 'max_size', fallback=400),
+     'overlap': config.getint('Chunking', 'overlap', fallback=200),
+     'adaptive': config.getboolean('Chunking', 'adaptive', fallback=False),
+     'multi_level': config.getboolean('Chunking', 'multi_level', fallback=False),
+     'language': config.get('Chunking', 'language', fallback='english')
+ }
+
+ openai_api_key = config.get('API', 'openai_api_key')
+ #
+ # End of settings
+ #######################################################################################################################
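The fallbacks above resolve against a configparser-style file loaded by `load_comprehensive_config()`. A minimal sketch of the lookup behavior, with an illustrative `[Chunking]` section (the sample values are assumptions, not the project's shipped config):

```python
import configparser

# Hypothetical snippet; the real file comes from load_comprehensive_config().
sample = """
[Chunking]
method = sentences
max_size = 400
overlap = 200
"""

cfg = configparser.ConfigParser()
cfg.read_string(sample)

print(cfg.get('Chunking', 'method', fallback='words'))         # 'sentences' (key present)
print(cfg.getint('Chunking', 'max_size', fallback=400))        # 400
print(cfg.getboolean('Chunking', 'adaptive', fallback=False))  # False (missing key -> fallback)
```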
+ #
+ # Functions:
+
+ def detect_language(text: str) -> str:
+     try:
+         return detect(text)
+     except:
+         # Default to English if detection fails
+         return 'en'
+
+
+ def load_document(file_path):
+     with open(file_path, 'r') as file:
+         text = file.read()
+     return re.sub('\\s+', ' ', text).strip()
+
+
+ def improved_chunking_process(text: str, custom_chunk_options: Dict[str, Any] = None) -> List[Dict[str, Any]]:
+     logging.debug("Improved chunking process started...")
+     options = chunk_options.copy()
+     if custom_chunk_options:
+         options.update(custom_chunk_options)
+
+     chunk_method = options.get('method', 'words')
+     base_size = options.get('base_size', 1000)
+     min_size = options.get('min_size', 100)
+     max_size = options.get('max_size', 2000)
+     overlap = options.get('overlap', 0)
+     language = options.get('language', None)
+     adaptive = options.get('adaptive', False)
+     multi_level = options.get('multi_level', False)
+
+     if language is None:
+         language = detect_language(text)
+
+     if adaptive:
+         max_chunk_size = adaptive_chunk_size(text, base_size, min_size, max_size)
+     else:
+         max_chunk_size = base_size
+
+     if multi_level:
+         chunks = multi_level_chunking(text, chunk_method, max_chunk_size, overlap, language)
+     else:
+         chunks = chunk_text(text, chunk_method, max_chunk_size, overlap, language)
+
+     chunks_with_metadata = []
+     for i, chunk in enumerate(chunks):
+         metadata = get_chunk_metadata(
+             chunk,
+             text,
+             chunk_type=chunk_method,
+             language=language
+         )
+         metadata['chunk_index'] = i
+         metadata['total_chunks'] = len(chunks)
+
+         chunks_with_metadata.append({
+             'text': chunk,
+             'metadata': metadata
+         })
+
+     return chunks_with_metadata
+
+
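A minimal usage sketch for the entry point above (hypothetical caller, assuming the module's heavyweight imports are available). Any key passed in `custom_chunk_options` overrides the config-derived defaults; `overlap` is set explicitly because a config fallback of 200 would make the sentence stride negative:

```python
text = "First sentence here. Second sentence here. Third sentence here. Fourth sentence here."

chunks = improved_chunking_process(text, {
    'method': 'sentences',
    'base_size': 2,          # two sentences per chunk
    'overlap': 0,            # override the config fallback of 200
    'language': 'english',   # skip langdetect; NLTK wants full language names
})

for c in chunks:
    print(c['metadata']['chunk_index'], 'of', c['metadata']['total_chunks'], '->', c['text'])
```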
+ def multi_level_chunking(text: str, method: str, max_size: int, overlap: int, language: str) -> List[str]:
+     logging.debug("Multi-level chunking process started...")
+     # First level: chunk by paragraphs
+     paragraphs = chunk_text_by_paragraphs(text, max_size * 2, overlap)
+
+     # Second level: chunk each paragraph further
+     chunks = []
+     for para in paragraphs:
+         if method == 'words':
+             chunks.extend(chunk_text_by_words(para, max_size, overlap, language))
+         elif method == 'sentences':
+             chunks.extend(chunk_text_by_sentences(para, max_size, overlap, language))
+         else:
+             chunks.append(para)
+
+     return chunks
+
+
+ # FIXME - ensure language detection occurs in each chunk function
+ def chunk_text(text: str, method: str, max_size: int, overlap: int, language: str = None) -> List[str]:
+
+     if method == 'words':
+         logging.debug("Chunking by words...")
+         return chunk_text_by_words(text, max_size, overlap, language)
+     elif method == 'sentences':
+         logging.debug("Chunking by sentences...")
+         return chunk_text_by_sentences(text, max_size, overlap, language)
+     elif method == 'paragraphs':
+         logging.debug("Chunking by paragraphs...")
+         return chunk_text_by_paragraphs(text, max_size, overlap)
+     elif method == 'tokens':
+         logging.debug("Chunking by tokens...")
+         return chunk_text_by_tokens(text, max_size, overlap)
+     elif method == 'semantic':
+         logging.debug("Chunking by semantic similarity...")
+         return semantic_chunking(text, max_size)
+     else:
+         return [text]
+
+ def determine_chunk_position(relative_position: float) -> str:
+     if relative_position < 0.33:
+         return "This chunk is from the beginning of the document"
+     elif relative_position < 0.66:
+         return "This chunk is from the middle of the document"
+     else:
+         return "This chunk is from the end of the document"
+
+
+ def chunk_text_by_words(text: str, max_words: int = 300, overlap: int = 0, language: str = None) -> List[str]:
+     logging.debug("chunk_text_by_words...")
+     if language is None:
+         language = detect_language(text)
+
+     if language.startswith('zh'):  # Chinese
+         import jieba
+         words = list(jieba.cut(text))
+     elif language == 'ja':  # Japanese
+         import fugashi
+         tagger = fugashi.Tagger()
+         words = [word.surface for word in tagger(text)]
+     else:  # Default to simple splitting for other languages
+         words = text.split()
+
+     chunks = []
+     for i in range(0, len(words), max_words - overlap):
+         chunk = ' '.join(words[i:i + max_words])
+         chunks.append(chunk)
+     return post_process_chunks(chunks)
+
+
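The slicing above advances by `max_words - overlap` per step, so consecutive chunks share `overlap` words. The same loop in isolation, with plain `str.split` and no language detection:

```python
words = "w0 w1 w2 w3 w4 w5 w6 w7 w8 w9".split()
max_words, overlap = 4, 1

chunks = [' '.join(words[i:i + max_words])
          for i in range(0, len(words), max_words - overlap)]

# Stride is 3, so each chunk repeats the last word of the previous one:
# ['w0 w1 w2 w3', 'w3 w4 w5 w6', 'w6 w7 w8 w9', 'w9']
print(chunks)
```

Note the trailing runt chunk: `post_process_chunks` keeps it because it is non-empty, and an `overlap >= max_words` would make the stride zero or negative.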
+ def chunk_text_by_sentences(text: str, max_sentences: int = 10, overlap: int = 0, language: str = None) -> List[str]:
+     logging.debug("chunk_text_by_sentences...")
+     if language is None:
+         language = detect_language(text)
+
+     nltk.download('punkt', quiet=True)
+
+     if language.startswith('zh'):  # Chinese
+         import jieba
+         sentences = list(jieba.cut(text, cut_all=False))
+     elif language == 'ja':  # Japanese
+         import fugashi
+         tagger = fugashi.Tagger()
+         sentences = [word.surface for word in tagger(text) if word.feature.pos1 in ['記号', '補助記号'] and word.surface.strip()]
+     else:  # Default to NLTK for other languages
+         sentences = sent_tokenize(text, language=language)
+
+     chunks = []
+     for i in range(0, len(sentences), max_sentences - overlap):
+         chunk = ' '.join(sentences[i:i + max_sentences])
+         chunks.append(chunk)
+     return post_process_chunks(chunks)
+
+
+ def chunk_text_by_paragraphs(text: str, max_paragraphs: int = 5, overlap: int = 0) -> List[str]:
+     logging.debug("chunk_text_by_paragraphs...")
+     paragraphs = re.split(r'\n\s*\n', text)
+     chunks = []
+     for i in range(0, len(paragraphs), max_paragraphs - overlap):
+         chunk = '\n\n'.join(paragraphs[i:i + max_paragraphs])
+         chunks.append(chunk)
+     return post_process_chunks(chunks)
+
+
+ def chunk_text_by_tokens(text: str, max_tokens: int = 1000, overlap: int = 0) -> List[str]:
+     logging.debug("chunk_text_by_tokens...")
+     # This is a simplified token-based chunking. For more accurate tokenization,
+     # consider using a proper tokenizer like GPT-2 TokenizerFast
+     words = text.split()
+     chunks = []
+     current_chunk = []
+     current_token_count = 0
+
+     for word in words:
+         word_token_count = len(word) // 4 + 1  # Rough estimate of token count
+         if current_token_count + word_token_count > max_tokens and current_chunk:
+             chunks.append(' '.join(current_chunk))
+             current_chunk = current_chunk[-overlap:] if overlap > 0 else []
+             current_token_count = sum(len(w) // 4 + 1 for w in current_chunk)
+
+         current_chunk.append(word)
+         current_token_count += word_token_count
+
+     if current_chunk:
+         chunks.append(' '.join(current_chunk))
+
+     return post_process_chunks(chunks)
+
+
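The `len(word) // 4 + 1` heuristic above trades accuracy for speed. A quick comparison against the module-level GPT-2 tokenizer (exact counts vary by text; the estimate is usually in the right ballpark but tends to overshoot):

```python
text = "Natural language processing has its roots in the 1950s."

estimate = sum(len(w) // 4 + 1 for w in text.split())  # rough per-word guess
actual = len(tokenizer.encode(text))                   # real GPT-2 BPE count

print(estimate, actual)
```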
+ def post_process_chunks(chunks: List[str]) -> List[str]:
+     return [chunk.strip() for chunk in chunks if chunk.strip()]
+
+
+ # FIXME - F
+ def get_chunk_metadata(chunk: str, full_text: str, chunk_type: str = "generic",
+                        chapter_number: Optional[int] = None,
+                        chapter_pattern: Optional[str] = None,
+                        language: str = None) -> Dict[str, Any]:
+     try:
+         logging.debug("get_chunk_metadata...")
+         start_index = full_text.index(chunk)
+         end_index = start_index + len(chunk)
+         # Calculate a hash for the chunk
+         chunk_hash = hashlib.md5(chunk.encode()).hexdigest()
+
+         metadata = {
+             'start_index': start_index,
+             'end_index': end_index,
+             'word_count': len(chunk.split()),
+             'char_count': len(chunk),
+             'chunk_type': chunk_type,
+             'language': language,
+             'chunk_hash': chunk_hash,
+             'relative_position': start_index / len(full_text)
+         }
+
+         if chunk_type == "chapter":
+             metadata['chapter_number'] = chapter_number
+             metadata['chapter_pattern'] = chapter_pattern
+
+         return metadata
+     except ValueError as e:
+         logging.error(f"Chunk not found in full_text: {chunk[:50]}... Full text length: {len(full_text)}")
+         raise
+
+
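For a sense of the returned shape, a small direct call (the values in the comment are what the arithmetic above yields for this toy input):

```python
full_text = "Alpha beta gamma. Delta epsilon."
chunk = "Delta epsilon."

meta = get_chunk_metadata(chunk, full_text, chunk_type="sentences", language="en")

# {'start_index': 18, 'end_index': 32, 'word_count': 2, 'char_count': 14,
#  'chunk_type': 'sentences', 'language': 'en',
#  'chunk_hash': '<md5 hex digest>', 'relative_position': 0.5625}
print(meta)
```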
+ def process_document_with_metadata(text: str, chunk_options: Dict[str, Any],
+                                    document_metadata: Dict[str, Any]) -> Dict[str, Any]:
+     chunks = improved_chunking_process(text, chunk_options)
+
+     return {
+         'document_metadata': document_metadata,
+         'chunks': chunks
+     }
+
+
+ # Hybrid approach: chunk by sentence while ensuring the total token size does not exceed a maximum
+ def chunk_text_hybrid(text, max_tokens=1000):
+     logging.debug("chunk_text_hybrid...")
+     sentences = nltk.tokenize.sent_tokenize(text)
+     chunks = []
+     current_chunk = []
+     current_length = 0
+
+     for sentence in sentences:
+         tokens = tokenizer.encode(sentence)
+         if current_length + len(tokens) <= max_tokens:
+             current_chunk.append(sentence)
+             current_length += len(tokens)
+         else:
+             chunks.append(' '.join(current_chunk))
+             current_chunk = [sentence]
+             current_length = len(tokens)
+
+     if current_chunk:
+         chunks.append(' '.join(current_chunk))
+
+     return chunks
+
+
+ # Thanks openai
+ def chunk_on_delimiter(input_string: str,
+                        max_tokens: int,
+                        delimiter: str) -> List[str]:
+     logging.debug("chunk_on_delimiter...")
+     chunks = input_string.split(delimiter)
+     combined_chunks, _, dropped_chunk_count = combine_chunks_with_no_minimum(
+         chunks, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True)
+     if dropped_chunk_count > 0:
+         print(f"Warning: {dropped_chunk_count} chunks were dropped due to exceeding the token limit.")
+     combined_chunks = [f"{chunk}{delimiter}" for chunk in combined_chunks]
+     return combined_chunks
+
+
+
+
+ # ????FIXME
+ def recursive_summarize_chunks(chunks, summarize_func, custom_prompt, temp=None, system_prompt=None):
+     logging.debug("recursive_summarize_chunks...")
+     summarized_chunks = []
+     current_summary = ""
+
+     logging.debug(f"recursive_summarize_chunks: Summarizing {len(chunks)} chunks recursively...")
+     logging.debug(f"recursive_summarize_chunks: temperature is @ {temp}")
+     for i, chunk in enumerate(chunks):
+         if i == 0:
+             current_summary = summarize_func(chunk, custom_prompt, temp, system_prompt)
+         else:
+             combined_text = current_summary + "\n\n" + chunk
+             current_summary = summarize_func(combined_text, custom_prompt, temp, system_prompt)
+
+         summarized_chunks.append(current_summary)
+
+     return summarized_chunks
+
+
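Because `summarize_func` is injected, the rolling behavior can be exercised without any API call: step i summarizes the previous running summary concatenated with chunk i. A sketch with a hypothetical stand-in summarizer:

```python
def fake_summarize(text, custom_prompt, temp, system_prompt):
    # Stand-in "summarizer": keep only the last five words.
    return ' '.join(text.split()[-5:])

parts = ["alpha one two three four", "beta five six seven eight", "gamma nine ten eleven twelve"]
summaries = recursive_summarize_chunks(parts, fake_summarize, custom_prompt=None)

# One running summary per input chunk; each folds in the previous one.
print(summaries)
```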
+ # Sample text for testing
+ sample_text = """
+ Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence
+ concerned with the interactions between computers and human language, in particular how to program computers
+ to process and analyze large amounts of natural language data. The result is a computer capable of "understanding"
+ the contents of documents, including the contextual nuances of the language within them. The technology can then
+ accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.
+
+ Challenges in natural language processing frequently involve speech recognition, natural language understanding,
+ and natural language generation.
+
+ Natural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled
+ "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence.
+ """
+
+ # Example usage of different chunking methods
+ # print("Chunking by words:")
+ # print(chunk_text_by_words(sample_text, max_words=50))
+ #
+ # print("\nChunking by sentences:")
+ # print(chunk_text_by_sentences(sample_text, max_sentences=2))
+ #
+ # print("\nChunking by paragraphs:")
+ # print(chunk_text_by_paragraphs(sample_text, max_paragraphs=1))
+ #
+ # print("\nChunking by tokens:")
+ # print(chunk_text_by_tokens(sample_text, max_tokens=50))
+ #
+ # print("\nHybrid chunking:")
+ # print(chunk_text_hybrid(sample_text, max_tokens=50))
+
+
+
+ #######################################################################################################################
+ #
+ # Experimental Semantic Chunking
+ #
+
+ # Chunk text into segments based on semantic similarity
+ def count_units(text, unit='words'):
+     if unit == 'words':
+         return len(text.split())
+     elif unit == 'tokens':
+         return len(word_tokenize(text))
+     elif unit == 'characters':
+         return len(text)
+     else:
+         raise ValueError("Invalid unit. Choose 'words', 'tokens', or 'characters'.")
+
+
+ def semantic_chunking(text, max_chunk_size=2000, unit='words'):
+     logging.debug("semantic_chunking...")
+     nltk.download('punkt', quiet=True)
+     sentences = sent_tokenize(text)
+     vectorizer = TfidfVectorizer()
+     sentence_vectors = vectorizer.fit_transform(sentences)
+
+     chunks = []
+     current_chunk = []
+     current_size = 0
+
+     for i, sentence in enumerate(sentences):
+         sentence_size = count_units(sentence, unit)
+         if current_size + sentence_size > max_chunk_size and current_chunk:
+             chunks.append(' '.join(current_chunk))
+             overlap_size = count_units(' '.join(current_chunk[-3:]), unit)  # Use last 3 sentences for overlap
+             current_chunk = current_chunk[-3:]  # Keep last 3 sentences for overlap
+             current_size = overlap_size
+
+         current_chunk.append(sentence)
+         current_size += sentence_size
+
+         if i + 1 < len(sentences):
+             current_vector = sentence_vectors[i]
+             next_vector = sentence_vectors[i + 1]
+             similarity = cosine_similarity(current_vector, next_vector)[0][0]
+             if similarity < 0.5 and current_size >= max_chunk_size // 2:
+                 chunks.append(' '.join(current_chunk))
+                 overlap_size = count_units(' '.join(current_chunk[-3:]), unit)
+                 current_chunk = current_chunk[-3:]
+                 current_size = overlap_size
+
+     if current_chunk:
+         chunks.append(' '.join(current_chunk))
+
+     return chunks
+
+
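The boundary decision above hinges on cosine similarity between adjacent TF-IDF sentence vectors: below 0.5 (once the chunk is at least half full) it cuts. The signal in isolation:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

sents = [
    "The cat sat on the mat.",
    "The cat slept on the mat.",
    "Quarterly revenue grew nine percent.",
]
vecs = TfidfVectorizer().fit_transform(sents)

print(cosine_similarity(vecs[0], vecs[1])[0][0])  # high: shared vocabulary
print(cosine_similarity(vecs[1], vecs[2])[0][0])  # 0.0: topic shift, likely boundary
```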
+ def semantic_chunk_long_file(file_path, max_chunk_size=1000, overlap=100, unit='words'):
+     logging.debug("semantic_chunk_long_file...")
+     try:
+         with open(file_path, 'r', encoding='utf-8') as file:
+             content = file.read()
+
+         chunks = semantic_chunking(content, max_chunk_size, unit)
+         return chunks
+     except Exception as e:
+         logging.error(f"Error chunking text file: {str(e)}")
+         return None
+
+ #
+ #
+ #######################################################################################################################
+
+
+ #######################################################################################################################
+ #
+ # Embedding Chunking
+
+ def chunk_for_embedding(text: str, file_name: str, full_summary: str, custom_chunk_options: Dict[str, Any] = None) -> List[Dict[str, Any]]:
+     options = chunk_options.copy()
+     if custom_chunk_options:
+         options.update(custom_chunk_options)
+
+     chunks = improved_chunking_process(text, options)
+     total_chunks = len(chunks)
+
+     chunked_text_with_headers = []
+     for i, chunk in enumerate(chunks, 1):
+         chunk_text = chunk['text']
+         chunk_position = determine_chunk_position(chunk['metadata']['relative_position'])
+
+         chunk_header = f"""
+         Original Document: {file_name}
+         Full Document Summary: {full_summary or "Full document summary not available."}
+         Chunk: {i} of {total_chunks}
+         Position: {chunk_position}
+
+         --- Chunk Content ---
+         """
+
+         full_chunk_text = chunk_header + chunk_text
+         chunk['text'] = full_chunk_text
+         chunk['metadata']['file_name'] = file_name
+         chunked_text_with_headers.append(chunk)
+
+     return chunked_text_with_headers
+
+ #
+ # End of Embedding Chunking
+ #######################################################################################################################
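A minimal sketch of what `chunk_for_embedding` produces (file name and options here are placeholders; `overlap` is pinned to 0 so a config fallback of 200 cannot flip the sentence stride negative):

```python
docs = chunk_for_embedding(
    "First sentence here. Second sentence here. Third sentence here.",
    file_name="example.txt",   # placeholder
    full_summary=None,         # falls back to the "not available" line in the header
    custom_chunk_options={'method': 'sentences', 'base_size': 1, 'overlap': 0,
                          'language': 'english'},
)

# Each chunk's text now starts with the contextual header:
# Original Document / Full Document Summary / Chunk i of N / Position ...
print(docs[0]['text'])
print(docs[0]['metadata']['file_name'])  # 'example.txt'
```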
+
+
+ #######################################################################################################################
+ #
+ # OpenAI Rolling Summarization
+ #
+
+ client = OpenAI(api_key=openai_api_key)
+ def get_chat_completion(messages, model='gpt-4-turbo'):
+     response = client.chat.completions.create(
+         model=model,
+         messages=messages,
+         temperature=0,
+     )
+     return response.choices[0].message.content
+
+
+ # This function combines text chunks into larger blocks without exceeding a specified token count.
+ # It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow.
+ def combine_chunks_with_no_minimum(
+         chunks: List[str],
+         max_tokens: int,
+         chunk_delimiter="\n\n",
+         header: Optional[str] = None,
+         add_ellipsis_for_overflow=False,
+ ) -> Tuple[List[str], List[List[int]], int]:
+     dropped_chunk_count = 0
+     output = []  # list to hold the final combined chunks
+     output_indices = []  # list to hold the indices of the final combined chunks
+     candidate = (
+         [] if header is None else [header]
+     )  # list to hold the current combined chunk candidate
+     candidate_indices = []
+     for chunk_i, chunk in enumerate(chunks):
+         chunk_with_header = [chunk] if header is None else [header, chunk]
+         # FIXME MAKE NOT OPENAI SPECIFIC
+         if len(openai_tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens:
+             print(f"warning: chunk overflow")
+             if (
+                     add_ellipsis_for_overflow
+                     # FIXME MAKE NOT OPENAI SPECIFIC
+                     and len(openai_tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens
+             ):
+                 candidate.append("...")
+                 dropped_chunk_count += 1
+             continue  # this case would break downstream assumptions
+         # estimate token count with the current chunk added
+         # FIXME MAKE NOT OPENAI SPECIFIC
+         extended_candidate_token_count = len(openai_tokenize(chunk_delimiter.join(candidate + [chunk])))
+         # If the token count exceeds max_tokens, add the current candidate to output and start a new candidate
+         if extended_candidate_token_count > max_tokens:
+             output.append(chunk_delimiter.join(candidate))
+             output_indices.append(candidate_indices)
+             candidate = chunk_with_header  # re-initialize candidate
+             candidate_indices = [chunk_i]
+         # otherwise keep extending the candidate
+         else:
+             candidate.append(chunk)
+             candidate_indices.append(chunk_i)
+     # add the remaining candidate to output if it's not empty
+     if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0):
+         output.append(chunk_delimiter.join(candidate))
+         output_indices.append(candidate_indices)
+     return output, output_indices, dropped_chunk_count
+
+
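A worked sketch of the greedy packing above, with `str.split` standing in for `openai_tokenize` and the header, ellipsis, and index bookkeeping stripped away, to show just the core rule (flush the candidate when the next chunk would push it past `max_tokens`):

```python
def pack(chunks, max_tokens, delimiter=" "):
    # Simplified stand-in for combine_chunks_with_no_minimum.
    output, candidate = [], []
    for chunk in chunks:
        if len(delimiter.join(candidate + [chunk]).split()) > max_tokens:
            output.append(delimiter.join(candidate))
            candidate = [chunk]
        else:
            candidate.append(chunk)
    if candidate:
        output.append(delimiter.join(candidate))
    return output

print(pack(["a b", "c d", "e f g", "h"], max_tokens=4))
# ['a b c d', 'e f g h']
```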
+ def rolling_summarize(text: str,
+                       detail: float = 0,
+                       model: str = 'gpt-4-turbo',
+                       additional_instructions: Optional[str] = None,
+                       minimum_chunk_size: Optional[int] = 500,
+                       chunk_delimiter: str = ".",
+                       summarize_recursively=False,
+                       verbose=False):
+     """
+     Summarizes a given text by splitting it into chunks, each of which is summarized individually.
+     The level of detail in the summary can be adjusted, and the process can optionally be made recursive.
+
+     Parameters:
+     - text (str): The text to be summarized.
+     - detail (float, optional): A value between 0 and 1 indicating the desired level of detail in the summary.
+       0 leads to a higher-level summary, and 1 results in a more detailed summary. Defaults to 0.
+     - additional_instructions (Optional[str], optional): Additional instructions to provide to the model for
+       customizing summaries.
+     - minimum_chunk_size (Optional[int], optional): The minimum size for text chunks. Defaults to 500.
+     - chunk_delimiter (str, optional): The delimiter used to split the text into chunks. Defaults to ".".
+     - summarize_recursively (bool, optional): If True, summaries are generated recursively, using previous summaries for context.
+     - verbose (bool, optional): If True, prints detailed information about the chunking process.
+
+     Returns:
+     - str: The final compiled summary of the text.
+
+     The function first determines the number of chunks by interpolating between a minimum and a maximum chunk count
+     based on the `detail` parameter. It then splits the text into chunks and summarizes each chunk. If
+     `summarize_recursively` is True, each summary is based on the previous summaries, adding more context to the
+     summarization process. The function returns a compiled summary of all chunks.
+     """
+
+     # check detail is set correctly
+     assert 0 <= detail <= 1
+
+     # interpolate the number of chunks to get the specified level of detail
+     max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter))
+     min_chunks = 1
+     num_chunks = int(min_chunks + detail * (max_chunks - min_chunks))
+
+     # adjust chunk_size based on interpolated number of chunks
+     # FIXME MAKE NOT OPENAI SPECIFIC
+     document_length = len(openai_tokenize(text))
+     chunk_size = max(minimum_chunk_size, document_length // num_chunks)
+     text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
+     if verbose:
+         print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.")
+         # FIXME MAKE NOT OPENAI SPECIFIC
+         print(f"Chunk lengths are {[len(openai_tokenize(x)) for x in text_chunks]}")
+
+     # set system message - FIXME
+     system_message_content = "Rewrite this text in summarized form."
+     if additional_instructions is not None:
+         system_message_content += f"\n\n{additional_instructions}"
+
+     accumulated_summaries = []
+     for i, chunk in enumerate(tqdm(text_chunks)):
+         if summarize_recursively and accumulated_summaries:
+             # Combine previous summary with current chunk for recursive summarization
+             combined_text = accumulated_summaries[-1] + "\n\n" + chunk
+             user_message_content = f"Previous summary and new content to summarize:\n\n{combined_text}"
+         else:
+             user_message_content = chunk
+
+         messages = [
+             {"role": "system", "content": system_message_content},
+             {"role": "user", "content": user_message_content}
+         ]
+
+         response = get_chat_completion(messages, model=model)
+         accumulated_summaries.append(response)
+
+     final_summary = '\n\n'.join(accumulated_summaries)
+     return final_summary
+
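The `detail` knob only controls how many chunks the text is cut into: 0 collapses toward a single coarse summary, 1 pushes toward `max_chunks` pieces of roughly `minimum_chunk_size` tokens. The interpolation is plain arithmetic:

```python
# Suppose chunk_on_delimiter split the document into 20 minimum-size pieces.
max_chunks, min_chunks = 20, 1

for detail in (0.0, 0.25, 0.5, 1.0):
    num_chunks = int(min_chunks + detail * (max_chunks - min_chunks))
    print(detail, '->', num_chunks)   # 0.0 -> 1, 0.25 -> 5, 0.5 -> 10, 1.0 -> 20
```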
+ #
+ #
+ #######################################################################################################################
+ #
+ # Ebook Chapter Chunking
+
+
+ def chunk_ebook_by_chapters(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
+     logging.debug("chunk_ebook_by_chapters")
+     max_chunk_size = chunk_options.get('max_size', 300)
+     overlap = chunk_options.get('overlap', 0)
+     custom_pattern = chunk_options.get('custom_chapter_pattern', None)
+
+     # List of chapter heading patterns to try, in order
+     chapter_patterns = [
+         custom_pattern,
+         r'^#{1,2}\s+',      # Markdown style: '# ' or '## '
+         r'^Chapter\s+\d+',  # 'Chapter ' followed by numbers
+         r'^\d+\.\s+',       # Numbered chapters: '1. ', '2. ', etc.
+         r'^[A-Z\s]+$'       # All caps headings
+     ]
+
+     chapter_positions = []
+     used_pattern = None
+
+     for pattern in chapter_patterns:
+         if pattern is None:
+             continue
+         chapter_regex = re.compile(pattern, re.MULTILINE | re.IGNORECASE)
+         chapter_positions = [match.start() for match in chapter_regex.finditer(text)]
+         if chapter_positions:
+             used_pattern = pattern
+             break
+
+     # If no chapters found, return the entire content as one chunk
+     if not chapter_positions:
+         return [{'text': text, 'metadata': get_chunk_metadata(text, text, chunk_type="whole_document")}]
+
+     # Split content into chapters
+     chunks = []
+     for i in range(len(chapter_positions)):
+         start = chapter_positions[i]
+         end = chapter_positions[i + 1] if i + 1 < len(chapter_positions) else None
+         chapter = text[start:end]
+
+         # Apply overlap if specified
+         if overlap > 0 and i > 0:
+             overlap_start = max(0, start - overlap)
+             chapter = text[overlap_start:end]
+
+         chunks.append(chapter)
+
+     # Post-process chunks
+     processed_chunks = post_process_chunks(chunks)
+
+     # Add metadata to chunks
+     return [{'text': chunk, 'metadata': get_chunk_metadata(chunk, text, chunk_type="chapter", chapter_number=i + 1,
+                                                            chapter_pattern=used_pattern)}
+             for i, chunk in enumerate(processed_chunks)]
+
+
+ # # Example usage
+ # if __name__ == "__main__":
+ #     sample_ebook_content = """
+ #     # Chapter 1: Introduction
+ #
+ #     This is the introduction.
+ #
+ #     ## Section 1.1
+ #
+ #     Some content here.
+ #
+ #     # Chapter 2: Main Content
+ #
+ #     This is the main content.
+ #
+ #     ## Section 2.1
+ #
+ #     More content here.
+ #
+ #     CHAPTER THREE
+ #
+ #     This is the third chapter.
+ #
+ #     4. Fourth Chapter
+ #
+ #     This is the fourth chapter.
+ #     """
+ #
+ #     chunk_options = {
+ #         'method': 'chapters',
+ #         'max_size': 500,
+ #         'overlap': 50,
+ #         'custom_chapter_pattern': r'^CHAPTER\s+[A-Z]+'  # Custom pattern for 'CHAPTER THREE' style
+ #     }
+ #
+ #     chunked_chapters = improved_chunking_process(sample_ebook_content, chunk_options)
+ #
+ #     for i, chunk in enumerate(chunked_chapters, 1):
+ #         print(f"Chunk {i}:")
+ #         print(chunk['text'])
+ #         print(f"Metadata: {chunk['metadata']}\n")
+
+ #
+ # End of ebook chapter chunking
+ #######################################################################################################################
+
+ #######################################################################################################################
+ #
+ # Functions for adaptive chunking:
+
+ # FIXME - punkt
+ def adaptive_chunk_size(text: str, base_size: int = 1000, min_size: int = 500, max_size: int = 2000) -> int:
+     # Ensure NLTK data is downloaded
+     nltk.download('punkt', quiet=True)
+
+     # Tokenize the text into sentences
+     sentences = sent_tokenize(text)
+
+     # Calculate average sentence length
+     avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences)
+
+     # Adjust chunk size based on average sentence length
+     if avg_sentence_length < 10:
+         size_factor = 1.2  # Increase chunk size for short sentences
+     elif avg_sentence_length > 20:
+         size_factor = 0.8  # Decrease chunk size for long sentences
+     else:
+         size_factor = 1.0
+
+     # Calculate adaptive chunk size
+     adaptive_size = int(base_size * size_factor)
+
+     # Ensure chunk size is within bounds
+     return max(min_size, min(adaptive_size, max_size))
+
+
+ def adaptive_chunk_size_non_punkt(text: str, base_size: int, min_size: int = 100, max_size: int = 2000) -> int:
+     # Adaptive logic: adjust chunk size based on text complexity
+     words = text.split()
+     if not words:
+         return base_size  # Return base_size if text is empty
+
+     avg_word_length = sum(len(word) for word in words) / len(words)
+
+     if avg_word_length > 6:  # Threshold for "complex" text
+         adjusted_size = int(base_size * 0.8)  # Reduce chunk size for complex text
+     elif avg_word_length < 4:  # Threshold for "simple" text
+         adjusted_size = int(base_size * 1.2)  # Increase chunk size for simple text
+     else:
+         adjusted_size = base_size
+
+     # Ensure the chunk size is within the specified range
+     return max(min_size, min(adjusted_size, max_size))
+
+
+ def adaptive_chunking(text: str, base_size: int = 1000, min_size: int = 500, max_size: int = 2000) -> List[str]:
+     logging.debug("adaptive_chunking...")
+     chunk_size = adaptive_chunk_size(text, base_size, min_size, max_size)
+     words = text.split()
+     chunks = []
+     current_chunk = []
+     current_length = 0
+
+     for word in words:
+         if current_length + len(word) > chunk_size and current_chunk:
+             chunks.append(' '.join(current_chunk))
+             current_chunk = []
+             current_length = 0
+         current_chunk.append(word)
+         current_length += len(word) + 1  # +1 for space
+
+     if current_chunk:
+         chunks.append(' '.join(current_chunk))
+
+     return chunks
+
+ # FIXME - usage example
+ # chunk_options = {
+ #     'method': 'words',  # or any other method
+ #     'base_size': 1000,
+ #     'min_size': 100,
+ #     'max_size': 2000,
+ #     'adaptive': True,
+ #     'language': 'en'
+ # }
+ # chunks = improved_chunking_process(your_text, chunk_options)
+
+
+ # Example of chunking a document with metadata
+ # document_metadata = {
+ #     'title': 'Example Document',
+ #     'author': 'John Doe',
+ #     'creation_date': '2023-06-14',
+ #     'source': 'https://example.com/document',
+ #     'document_type': 'article'
+ # }
+ #
+ # chunk_options = {
+ #     'method': 'sentences',
+ #     'base_size': 1000,
+ #     'adaptive': True,
+ #     'language': 'en'
+ # }
+ #
+ # processed_document = process_document_with_metadata(your_text, chunk_options, document_metadata)
+
+
+ #
+ # End of Chunking Library
  #######################################################################################################################