# article_writer / utils.py
import re
from unidecode import unidecode
from nltk import sent_tokenize
# from transformers import AutoTokenizer
# import yaml
# import fitz
# import requests
# from bs4 import BeautifulSoup
# from collections import defaultdict

def remove_accents(input_str):
    """Transliterate accented characters to their closest ASCII equivalents."""
    return unidecode(input_str)

def remove_special_characters(text):
    """Strip URLs, emoji, hashtags, and stray symbols, then normalize spacing."""
    # Drop URLs.
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    # Drop emoji and pictographic symbols.
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )
    text = emoji_pattern.sub("", text)
    # Drop hashtags.
    text = re.sub(r"#\w+", "", text)
    # Keep only word characters, whitespace, and common punctuation. The hyphen is
    # placed last in the character class so it is treated literally rather than as
    # an unintended range (the original ")-;" range also matched *, +, / and :).
    text = re.sub(r'[^\w\s\d.,!?\'"();-]', "", text)
    # Normalize spacing around punctuation.
    text = re.sub(r"\s+([.,!?;])", r"\1", text)
    text = re.sub(r"([.,!?;])(\S)", r"\1 \2", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text
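
# Illustrative sketch of the cleaning above (the input string is made up): URLs,
# emoji, and hashtags are dropped, then spacing around punctuation is normalized, so
#   remove_special_characters("Check this out! https://example.com \U0001F600 #wow")
# would return roughly "Check this out!".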

def remove_special_characters_2(text):
    """Keep only ASCII letters, digits, and spaces."""
    return re.sub(r"[^a-zA-Z0-9 ]+", "", text)

def split_into_sentences(text):
    """Split text on sentence-ending punctuation followed by whitespace."""
    return re.split(r"(?<=[.!?]) +", text)

def get_token_length(tokenizer, sentence):
    """Return the number of tokens the given tokenizer produces for a sentence."""
    return len(tokenizer.tokenize(sentence))
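
# Illustrative only: with a Hugging Face tokenizer (see the commented-out
# AutoTokenizer import above), token counting would look roughly like this;
# the model name below is an assumption, not something this module pins down.
#   from transformers import AutoTokenizer
#   tok = AutoTokenizer.from_pretrained("roberta-base")
#   get_token_length(tok, "A short example sentence.")  # -> small integer token count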

# Maximum token budget per chunk for each detector type.
MC_TOKEN_SIZE = 256
BC_TOKEN_SIZE = 333

def split_text_allow_complete_sentences_nltk(text, type_det="bc", tokenizer=None):
    """Split text into chunks of whole sentences, each within the detector's token budget."""
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0
    if type_det == "bc":
        max_tokens = BC_TOKEN_SIZE
    elif type_det == "mc":
        max_tokens = MC_TOKEN_SIZE
    elif type_det == "quillbot":
        max_tokens = 256
    else:
        raise ValueError(f"Unknown type_det: {type_det}")

    def add_sentence_to_chunk(sentence):
        nonlocal current_chunk, current_length
        sentence_length = get_token_length(tokenizer, sentence)
        # Start a new chunk if adding this sentence would exceed the budget.
        if current_length + sentence_length > max_tokens:
            chunks.append((current_chunk, current_length))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += sentence_length

    for sentence in sentences:
        add_sentence_to_chunk(sentence)
    if current_chunk:
        chunks.append((current_chunk, current_length))

    # Merge a short chunk (under half the budget) into the next one when the
    # combined size still fits; otherwise keep the chunks as they are.
    adjusted_chunks = []
    while chunks:
        chunk = chunks.pop(0)
        if len(chunks) > 0 and chunk[1] < max_tokens / 2:
            next_chunk = chunks.pop(0)
            combined_length = chunk[1] + next_chunk[1]
            if combined_length <= max_tokens:
                adjusted_chunks.append((chunk[0] + next_chunk[0], combined_length))
            else:
                adjusted_chunks.append(chunk)
                chunks.insert(0, next_chunk)
        else:
            adjusted_chunks.append(chunk)
    return [" ".join(chunk[0]) for chunk in adjusted_chunks]
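
# A minimal, illustrative usage sketch. The tokenizer and model name here are
# assumptions (this module only receives a tokenizer from its caller), so treat
# this as a demonstration of the chunking flow, not a pinned setup.
if __name__ == "__main__":
    from transformers import AutoTokenizer  # only needed for this demo

    demo_text = (
        "Sentence one is short. Sentence two is also short. "
        "Sentence three keeps the paragraph going a little longer."
    )
    demo_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    demo_chunks = split_text_allow_complete_sentences_nltk(
        demo_text, type_det="mc", tokenizer=demo_tokenizer
    )
    for i, chunk in enumerate(demo_chunks):
        print(i, get_token_length(demo_tokenizer, chunk), chunk)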