Atanu Sarkar committed
Commit 694a076 · unverified · 2 parents: 49d583d 24a271d

Merge pull request #11 from soumik12345/feat/semantic-chunking

docs/chunking.md ADDED
@@ -0,0 +1,3 @@
+# Chunking
+
+::: medrag_multi_modal.semantic_chunking
medrag_multi_modal/{semantic_chunker.py → semantic_chunking.py} RENAMED
@@ -17,6 +17,42 @@ TOKENIZER_OR_TOKEN_COUNTER = Union[
 
 
 class SemanticChunker:
+    """
+    SemanticChunker is a class that chunks documents into smaller segments and
+    publishes them as datasets.
+
+    This class uses the `semchunk` library to break down large documents into
+    smaller, manageable chunks based on a specified tokenizer or token counter.
+    This is particularly useful for processing large text datasets where
+    smaller segments are needed for analysis or other operations.
+
+    !!! example "Example Usage"
+        ```python
+        import weave
+        from dotenv import load_dotenv
+
+        from medrag_multi_modal.semantic_chunking import SemanticChunker
+
+        load_dotenv()
+        weave.init(project_name="ml-colabs/medrag-multi-modal")
+        chunker = SemanticChunker(chunk_size=256)
+        chunker.chunk_and_publish(
+            document_dataset_name="grays-anatomy-text:v13",
+            chunk_dataset_name="grays-anatomy-chunks",
+        )
+        ```
+
+    Args:
+        tokenizer_or_token_counter (TOKENIZER_OR_TOKEN_COUNTER): The tokenizer or
+            token counter to be used for chunking.
+        chunk_size (Optional[int]): The size of each chunk. If not specified, the
+            default chunk size from `semchunk` will be used.
+        max_token_chars (Optional[int]): The maximum number of characters per token.
+            If not specified, the default value from `semchunk` will be used.
+        memoize (bool): Whether to memoize the chunking process for efficiency.
+            Default is True.
+    """
+
     def __init__(
         self,
         tokenizer_or_token_counter: TOKENIZER_OR_TOKEN_COUNTER = "o200k_base",
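
Editor's note: the docstring above states that chunking is delegated to the `semchunk` library, and the documented arguments (`tokenizer_or_token_counter`, `chunk_size`, `max_token_chars`, `memoize`) match `semchunk.chunkerify`. The sketch below shows that library used directly with the same defaults; how `SemanticChunker.chunk_and_publish` actually wires these pieces together is not shown in this diff, so treat this as an illustration rather than the class internals.

```python
# Minimal sketch (not from this commit): chunking text with semchunk directly,
# using the defaults documented in SemanticChunker's Args section.
import semchunk

# "o200k_base" is a tiktoken encoding name; chunk_size caps tokens per chunk.
chunker = semchunk.chunkerify("o200k_base", chunk_size=256)

document = (
    "The stomach is a muscular, hollow organ in the upper "
    "gastrointestinal tract of humans and many other animals. "
) * 40

chunks = chunker(document)  # list[str], each chunk at most ~256 tokens
print(f"{len(chunks)} chunks; first chunk starts with: {chunks[0][:60]!r}")
```

If the class follows this pattern, `chunk_and_publish` would apply such a chunker to each document in `document_dataset_name` and publish the resulting chunks under `chunk_dataset_name`.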
mkdocs.yml CHANGED
@@ -70,6 +70,7 @@ nav:
       - PDFPlumber: 'document_loader/text_loader/pdfplumber_text_loader.md'
       - Marker: 'document_loader/text_loader/marker_text_loader.md'
     - Image Loader: 'document_loader/load_image.md'
+  - Chunking: 'chunking.md'
   - Retrieval:
     - Multi-Modal Retrieval: 'retreival/multi_modal_retrieval.md'
 
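
Editor's note: for completeness, here is a hedged sketch of reading the published chunks back with Weave after `chunk_and_publish` has run. The dataset name follows the docstring example; the `:latest` alias and the row schema are assumptions, since this commit does not show what `chunk_and_publish` stores.

```python
# Hypothetical follow-up (not part of this commit): fetch the published chunk
# dataset from Weave and inspect a few rows.
import weave

weave.init(project_name="ml-colabs/medrag-multi-modal")

# Name mirrors chunk_dataset_name from the docstring example;
# ":latest" is an assumed version alias.
chunk_dataset = weave.ref("grays-anatomy-chunks:latest").get()

for row in list(chunk_dataset.rows)[:3]:
    print(row)  # row structure depends on what chunk_and_publish stores
```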