Spaces:
Running
Running
import re | |
import re | |
from sentence_transformers import SentenceTransformer, util | |
import re | |
from unidecode import unidecode | |
from transformers import AutoTokenizer | |
import yaml | |
import fitz | |
def remove_accents(input_str):
    """Return ``input_str`` after passing it through ``unidecode``.

    Args:
        input_str: Text to transliterate.

    Returns:
        Whatever ``unidecode`` produces for the given text.
    """
    return unidecode(input_str)
def remove_special_characters(text):
    """Strip URLs, emoji, hashtags and stray symbols, then tidy spacing.

    The substitutions run in a fixed order; each one operates on the output
    of the previous one.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string with whitespace collapsed and the ends stripped.
    """
    # Character ranges removed by the "emoji" pass.
    # NOTE: the last range (U+24C2..U+1F251) is very wide and also covers
    # many non-emoji code points that lie between those two bounds.
    emoji_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"
    )
    emoji_re = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)

    # (pattern, replacement) pairs, applied top to bottom.
    passes = (
        # URLs starting with http(s):// or www.
        (r'https?://\S+|www\.\S+', ''),
        (emoji_re, ''),
        # hashtags
        (r'#\w+', ''),
        # Anything that is not a word char, whitespace or listed punctuation.
        # NOTE: `)-;` inside the class is a *range* (")" through ";"), so
        # the characters * + , - . / 0-9 : ; are all kept as well.
        (r'[^\w\s\d.,!?\'"()-;]', ''),
        # drop whitespace that precedes punctuation
        (r'\s+([.,!?;])', r'\1'),
        # ensure a space follows punctuation when a non-space comes next
        (r'([.,!?;])(\S)', r'\1 \2'),
        # collapse whitespace runs
        (r'\s+', ' '),
    )
    cleaned = text
    for pattern, replacement in passes:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def remove_special_characters_2(text):
    """Delete every character that is not an ASCII letter, digit or space.

    Args:
        text: Input string.

    Returns:
        The string with all other characters removed (nothing is inserted
        in their place).
    """
    return re.sub(r"[^a-zA-Z0-9 ]+", "", text)
def update_character_count(text):
    """Return a label of the form ``"<n> characters"`` for ``len(text)``."""
    count = len(text)
    return f"{count} characters"
# Read the YAML configuration once at import time. The encoding is given
# explicitly so decoding does not depend on the platform's locale default.
with open("config.yaml", "r", encoding="utf-8") as file:
    params = yaml.safe_load(file)

# Tokenizer loaded from the path configured under TEXT_BC_MODEL_PATH;
# `len_validator` uses it to count tokens.
text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
def len_validator(text):
    """Report whether ``text`` reaches the minimum token count.

    Tokens are counted with the module-level ``text_bc_tokenizer``.

    Args:
        text: Input text to measure.

    Returns:
        A confirmation message when the text has at least 200 tokens,
        otherwise a warning that also states the recommended length
        (twice the minimum).
    """
    min_tokens = 200
    tokens = text_bc_tokenizer.tokenize(text=text, return_tensors="pt")
    token_count = len(tokens)
    if token_count >= min_tokens:
        # Message text (including its spelling) is user-visible output and
        # is kept exactly as is.
        return f"Input length ({token_count}) is satisified."
    return f"Warning! Input length is {token_count}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path passed to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output, joined
        without any separator.
    """
    # The context manager closes the document even if a page fails to
    # extract; previously the handle was never closed.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)
def _format_heading(heading):
    """Format one accumulated heading (upper-case words, trailing space).

    Headings longer than 10 characters are put on their own line. Shorter
    ones are returned stripped and stay inline.
    """
    if len(heading) <= 10:
        return heading.strip()
    if heading.strip().endswith(" A"):
        # A trailing lone "A" is split off onto the line after the heading
        # (presumably the article that starts the following sentence).
        formatted = "\n" + heading[:-2] + "\n" + heading[-2:]
    else:
        formatted = "\n" + heading + "\n"
    return formatted.strip(" ")


def format_headings(text):
    """Place runs of upper-case words on their own line.

    The text is split on single spaces. Consecutive pieces for which
    ``str.isupper()`` is true are collected into a heading; when the run
    ends, the heading is emitted (surrounded by newlines if it is longer
    than 10 characters) followed by the remaining pieces.

    Args:
        text: Input text.

    Returns:
        The pieces re-joined with single spaces. A heading that ends the
        text is emitted as well rather than being dropped.
    """
    formatted_lines = []
    heading_words = []
    for word in text.split(" "):
        if word.isupper():
            heading_words.append(word)
            continue
        if heading_words:
            formatted_lines.append(
                _format_heading(" ".join(heading_words) + " ")
            )
            heading_words = []
        formatted_lines.append(word.strip())
    # Flush a heading that runs to the end of the text; without this the
    # trailing upper-case words would be lost.
    if heading_words:
        formatted_lines.append(_format_heading(" ".join(heading_words) + " "))
    return " ".join(formatted_lines)
def format_live_site(text):
    """Re-insert line breaks and spacing into text scraped from a web page.

    Args:
        text: Page text with its layout flattened.

    Returns:
        The text after four passes, applied in order: break at
        lowercase/uppercase boundaries, break before ``<digits>.<digits>M``
        or ``K`` figures, heading formatting via ``format_headings``, and
        spacing after ``: ; , ! ?``.
    """
    # A lowercase letter directly followed by an uppercase one marks a
    # boundary: put a newline between them.
    with_breaks = re.sub(r"([a-z])([A-Z])", r"\1\n\2", text)

    # Figures such as "1.5M" glued to a preceding lowercase letter start a
    # new line and get a trailing space (the "What's included" items).
    with_items = re.sub(r"([a-z])(\d+\.\d+[MK])", r"\1\n\2 ", with_breaks)

    # All-caps headings go on their own line.
    with_headings = format_headings(with_items)

    # Add a space after ':', ';', ',', '!' or '?' when a non-space follows.
    return re.sub(r"([:;,!?])(\S)", r"\1 \2", with_headings)
def extract_text_from_html(url):
    """Download a page and return its visible text, reformatted.

    Args:
        url: Address passed to ``requests.get``.

    Returns:
        The page text with ``style``, ``script``, ``code`` and ``a``
        elements removed and ``format_live_site`` applied, or the string
        ``"Unable to extract URL"`` when the request or parsing raises or
        the response status is not 200.
    """
    # NOTE(review): `requests` and `BeautifulSoup` are not imported in the
    # visible part of this file. If they are missing, the NameError is
    # swallowed by the `except` below and this function always returns the
    # failure string -- confirm the imports exist.
    # NOTE(review): no timeout is passed to `requests.get`.
    failure = "Unable to extract URL"
    try:
        r = requests.get(url)
        if r.status_code != 200:
            # Previously `soup` stayed unbound here and the code below
            # raised UnboundLocalError; report the failure instead.
            return failure
        soup = BeautifulSoup(r.content, "html.parser")
    except Exception:
        return failure

    # Drop elements whose content should not appear in the extracted text.
    for tag in soup(["style", "script", "code", "a"]):
        tag.decompose()

    # Join the remaining text fragments with single spaces.
    text = " ".join(soup.stripped_strings)
    return format_live_site(text)