"""Text preprocessing utilities: accent stripping, special-character cleanup,
and sentence-aware chunking of text into token-limited pieces."""

import re

from nltk import sent_tokenize
from unidecode import unidecode

# from transformers import AutoTokenizer
# import yaml
# import fitz
# import requests
# from bs4 import BeautifulSoup
# from collections import defaultdict


def remove_accents(input_str):
    # Transliterate accented and other non-ASCII characters to their closest
    # ASCII equivalents.
    return unidecode(input_str)


def remove_special_characters(text):
    # Drop URLs entirely.
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    # Strip emoji and pictographic symbols.
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"  # enclosed characters
        "]+",
        flags=re.UNICODE,
    )
    text = emoji_pattern.sub("", text)
    # Remove hashtags.
    text = re.sub(r"#\w+", "", text)
    # Keep only word characters, whitespace, and basic punctuation. The hyphen
    # is placed last in the class so it is treated literally instead of forming
    # an unintended ")-;" character range.
    text = re.sub(r'[^\w\s.,!?\'"();-]', "", text)
    # Normalize spacing around punctuation, then collapse runs of whitespace.
    text = re.sub(r"\s+([.,!?;])", r"\1", text)
    text = re.sub(r"([.,!?;])(\S)", r"\1 \2", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


def remove_special_characters_2(text):
    # Aggressive variant: keep only ASCII letters, digits, and spaces.
    return re.sub(r"[^a-zA-Z0-9 ]+", "", text)


def split_into_sentences(text):
    # Lightweight regex-based sentence split on ., !, or ? followed by a space.
    return re.split(r"(?<=[.!?]) +", text)


def get_token_length(tokenizer, sentence):
    # Number of tokens the given tokenizer produces for the sentence.
    return len(tokenizer.tokenize(sentence))


# Per-chunk token budgets, selected via `type_det` in the chunker below.
MC_TOKEN_SIZE = 256
BC_TOKEN_SIZE = 333


def split_text_allow_complete_sentences_nltk(text, type_det="bc", tokenizer=None):
    """Split `text` into chunks of complete sentences, each at most `max_tokens`
    tokens long as measured by `tokenizer` (required despite the None default).
    The token budget is chosen by `type_det`; undersized chunks are merged with
    their successor whenever the combined chunk still fits the budget."""
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0
    if type_det == "bc":
        max_tokens = BC_TOKEN_SIZE
    elif type_det == "mc":
        max_tokens = MC_TOKEN_SIZE
    elif type_det == "quillbot":
        max_tokens = 256
    else:
        raise ValueError(f"Unknown type_det: {type_det!r}")

    def add_sentence_to_chunk(sentence):
        # Greedily append sentences, starting a new chunk once the budget would
        # be exceeded. A single sentence longer than the budget still ends up as
        # its own (oversized) chunk.
        nonlocal current_chunk, current_length
        sentence_length = get_token_length(tokenizer, sentence)
        if current_chunk and current_length + sentence_length > max_tokens:
            chunks.append((current_chunk, current_length))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += sentence_length

    for sentence in sentences:
        add_sentence_to_chunk(sentence)
    if current_chunk:
        chunks.append((current_chunk, current_length))

    # Second pass: merge any chunk shorter than half the budget with the chunk
    # that follows, provided the combined chunk still fits.
    adjusted_chunks = []
    while chunks:
        chunk = chunks.pop(0)
        if chunks and chunk[1] < max_tokens / 2:
            next_chunk = chunks.pop(0)
            combined_length = chunk[1] + next_chunk[1]
            if combined_length <= max_tokens:
                adjusted_chunks.append((chunk[0] + next_chunk[0], combined_length))
            else:
                adjusted_chunks.append(chunk)
                chunks.insert(0, next_chunk)
        else:
            adjusted_chunks.append(chunk)
    return [" ".join(sents) for sents, _ in adjusted_chunks]
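

# --- Usage sketch (illustrative only, not part of the original module) ------
# A minimal example of how these helpers fit together, assuming a Hugging Face
# tokenizer; "bert-base-uncased" is a stand-in, since the checkpoint actually
# used by callers is not specified in this file. Requires the NLTK "punkt"
# data (nltk.download("punkt")) for sent_tokenize.
if __name__ == "__main__":
    from transformers import AutoTokenizer  # see the commented-out import above

    raw = "Visit https://example.com! Café-style text , with an emoji 😀 and #tags. " * 20
    cleaned = remove_special_characters(remove_accents(raw))

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    chunks = split_text_allow_complete_sentences_nltk(cleaned, type_det="bc", tokenizer=tokenizer)
    for i, chunk in enumerate(chunks):
        print(f"chunk {i}: {get_token_length(tokenizer, chunk)} tokens")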