from typing import TypedDict, List
from tiktoken import Encoding


class Row(TypedDict):
  id: str
  title: str
  url: str
  text: str


def split_token(encoder: Encoding, rows: List[Row], input_texts: List[str], split: int = 512) -> List[Row]:
  dict_list: List[Row] = []

  # Tokenize all documents in a single batch
  for i, text_tokens in enumerate(encoder.encode_batch(input_texts)):
    row = rows[i]
    passages_count = (len(text_tokens) - 1) // split

    # Fixed-size passages counted from the start of the document
    for j in range(passages_count):
      tokens = text_tokens[j * split:(j + 1) * split]

      # Append tokens until the next one starts with whitespace,
      # so the passage ends on a word boundary
      for token in text_tokens[(j + 1) * split:]:
        if not encoder.decode_single_token_bytes(token).startswith(b' '):
          tokens.append(token)
        else:
          break

      # Prepend tokens until one starts with whitespace,
      # so the passage also begins on a word boundary
      if not encoder.decode_single_token_bytes(text_tokens[j * split]).startswith(b' '):
        for token in reversed(text_tokens[:j * split]):
          tokens.insert(0, token)
          if encoder.decode_single_token_bytes(token).startswith(b' '):
            break

      dict_list.append({'id': f"{row['id']}_{j}", 'title': row['title'], 'url': row['url'],
                        'text': encoder.decode(tokens)})

    # Final passage: the last `split` tokens of the document
    tokens = text_tokens[-split:]
    if not encoder.decode_single_token_bytes(tokens[0]).startswith(b' '):

      # Prepend tokens until one starts with whitespace
      for token in reversed(text_tokens[:-split]):
        tokens.insert(0, token)
        if encoder.decode_single_token_bytes(token).startswith(b' '):
          break

    dict_list.append({'id': f"{row['id']}_{passages_count}", 'title': row['title'], 'url': row['url'],
                      'text': encoder.decode(tokens)})
  return dict_list
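

# Example usage: a minimal sketch, not part of the original module. It assumes the
# tiktoken "cl100k_base" encoding can be loaded and uses a made-up sample row to
# show how a long document is split into word-aligned ~512-token passages.
if __name__ == '__main__':
  import tiktoken

  sample_rows: List[Row] = [
    {'id': 'doc1', 'title': 'Example', 'url': 'https://example.com/doc1',
     'text': 'word ' * 2000},
  ]
  encoding = tiktoken.get_encoding('cl100k_base')
  passages = split_token(encoding, sample_rows, [r['text'] for r in sample_rows], split=512)
  for passage in passages:
    print(passage['id'], len(encoding.encode(passage['text'])))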