import re

from unidecode import unidecode
from nltk import sent_tokenize

# from transformers import AutoTokenizer
# import yaml
# import fitz
# import requests
# from bs4 import BeautifulSoup
# from collections import defaultdict


def remove_accents(input_str):
    # Transliterate accented characters to their closest ASCII equivalents.
    text_no_accents = unidecode(input_str)
    return text_no_accents


def remove_special_characters(text):
    # Strip URLs first so their punctuation does not survive later steps.
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )
    text = emoji_pattern.sub("", text)
    # Drop hashtags entirely.
    text = re.sub(r"#\w+", "", text)
    # Keep word characters, whitespace, and common punctuation. The hyphen is
    # escaped so it matches literally instead of forming a ")-;" range.
    text = re.sub(r'[^\w\s\d.,!?\'"()\-;]', "", text)
    # Remove space before punctuation, ensure space after it, collapse whitespace.
    text = re.sub(r"\s+([.,!?;])", r"\1", text)
    text = re.sub(r"([.,!?;])(\S)", r"\1 \2", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


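# Illustrative example (not from the original source): remove_special_characters
# strips URLs, emoji, and hashtags, then normalizes punctuation spacing.
# >>> remove_special_characters("Great read!   https://example.com #AI 😀")
# 'Great read!'

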
def remove_special_characters_2(text):
    # Stricter variant: keep only ASCII letters, digits, and spaces.
    pattern = r"[^a-zA-Z0-9 ]+"
    text = re.sub(pattern, "", text)
    return text


def split_into_sentences(text):
    # Lightweight regex splitter: break on whitespace that follows
    # sentence-ending punctuation.
    sentences = re.split(r"(?<=[.!?]) +", text)
    return sentences


def get_token_length(tokenizer, sentence):
    return len(tokenizer.tokenize(sentence))


MC_TOKEN_SIZE = 256
BC_TOKEN_SIZE = 333


def split_text_allow_complete_sentences_nltk(text, type_det="bc", tokenizer=None): | |
sentences = sent_tokenize(text) | |
chunks = [] | |
current_chunk = [] | |
current_length = 0 | |
if type_det == "bc": | |
max_tokens = BC_TOKEN_SIZE | |
elif type_det == "mc": | |
max_tokens = MC_TOKEN_SIZE | |
elif type_det == "quillbot": | |
max_tokens = 256 | |
def add_sentence_to_chunk(sentence): | |
nonlocal current_chunk, current_length | |
sentence_length = get_token_length(tokenizer, sentence) | |
if current_length + sentence_length > max_tokens: | |
chunks.append((current_chunk, current_length)) | |
current_chunk = [] | |
current_length = 0 | |
current_chunk.append(sentence) | |
current_length += sentence_length | |
for sentence in sentences: | |
add_sentence_to_chunk(sentence) | |
if current_chunk: | |
chunks.append((current_chunk, current_length)) | |
adjusted_chunks = [] | |
while chunks: | |
chunk = chunks.pop(0) | |
if len(chunks) > 0 and chunk[1] < max_tokens / 2: | |
next_chunk = chunks.pop(0) | |
combined_length = chunk[1] + next_chunk[1] | |
if combined_length <= max_tokens: | |
adjusted_chunks.append((chunk[0] + next_chunk[0], combined_length)) | |
else: | |
adjusted_chunks.append(chunk) | |
chunks.insert(0, next_chunk) | |
else: | |
adjusted_chunks.append(chunk) | |
result_chunks = [" ".join(chunk[0]) for chunk in adjusted_chunks] | |
return result_chunks | |
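# Hedged usage sketch (not part of the original app): clean the text, then
# chunk it for a transformer model. The checkpoint name below is an assumption
# for illustration only; the original merely hints at transformers through the
# commented-out AutoTokenizer import. Requires nltk's "punkt" data.
if __name__ == "__main__":
    from transformers import AutoTokenizer  # assumed dependency

    raw = "Café culture is thriving! Read more at https://example.com 😀. It spans cities worldwide."
    cleaned = remove_special_characters(remove_accents(raw))
    tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder checkpoint
    for chunk in split_text_allow_complete_sentences_nltk(cleaned, type_det="bc", tokenizer=tok):
        print(chunk)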