import re

import fitz
import yaml
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer
from unidecode import unidecode

# Emoji / pictograph code-point ranges stripped by remove_special_characters.
# Compiled once at import time instead of on every call.
# NOTE(review): the last range (U+24C2..U+1F251) is numerically much wider
# than a block of "enclosed characters" -- it also covers kana (U+3040..),
# CJK ideographs (U+4E00..) and Hangul (U+AC00..), so such text is deleted
# too. Kept unchanged to preserve current output; confirm this is intended.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F700-\U0001F77F"  # alchemical symbols
    u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    u"\U0001FA00-\U0001FA6F"  # Chess Symbols
    u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    u"\U00002702-\U000027B0"  # Dingbats
    u"\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)


def remove_accents(input_str):
    """Return *input_str* transliterated by ``unidecode``."""
    return unidecode(input_str)


def remove_special_characters(text):
    """Strip URLs, emoji, hashtags and stray symbols, then tidy spacing.

    Steps, in order:
      1. remove ``http(s)://...`` and ``www....`` URLs;
      2. remove every character matched by ``_EMOJI_PATTERN``;
      3. remove ``#hashtag`` tokens;
      4. remove characters outside the allowed set (see note below);
      5. remove whitespace before ``. , ! ? ;``;
      6. insert a space after ``. , ! ? ;`` when a non-space follows;
      7. collapse whitespace runs to one space and trim both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = _EMOJI_PATTERN.sub('', text)
    text = re.sub(r'#\w+', '', text)
    # NOTE: inside this character class ")-;" is a *range* (U+0029..U+003B),
    # so besides the listed punctuation the characters * + - / : are kept as
    # well. Left byte-identical so existing output does not change.
    text = re.sub(r'[^\w\s\d.,!?\'"()-;]', '', text)
    text = re.sub(r'\s+([.,!?;])', r'\1', text)
    # This also splits things like "3.14" into "3. 14" and "!!" into "! !".
    text = re.sub(r'([.,!?;])(\S)', r'\1 \2', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def remove_special_characters_2(text):
    """Remove every character that is not an ASCII letter, digit or space."""
    pattern = r"[^a-zA-Z0-9 ]+"
    return re.sub(pattern, "", text)


def update_character_count(text):
    """Return a label of the form ``"<n> characters"`` for *text*."""
    return f"{len(text)} characters"


# Module-level setup: the tokenizer path is read from config.yaml in the
# current working directory and the tokenizer is loaded once at import time.
with open("config.yaml", "r", encoding="utf-8") as file:
    params = yaml.safe_load(file)

text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)


def len_validator(text):
    """Report whether *text* reaches the minimum token count (200).

    The count is the number of tokens produced by ``text_bc_tokenizer``.

    Args:
        text: The input string to measure.

    Returns:
        A warning message when the text is shorter than the minimum,
        otherwise a confirmation message. Both include the token count.
    """
    min_tokens = 200
    # NOTE(review): ``return_tensors`` is forwarded to ``tokenize`` exactly as
    # before; it is unclear whether it affects the token list -- confirm and
    # drop it if it does not.
    token_count = len(
        text_bc_tokenizer.tokenize(text=text, return_tensors="pt")
    )
    if token_count < min_tokens:
        return f"Warning! Input length is {token_count}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
    return f"Input length ({token_count}) is satisified."
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    The document is opened with ``fitz.open`` and closed again when done.
    """
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)


def _format_heading_run(heading):
    """Render one accumulated run of upper-case words.

    *heading* is the run as built by ``format_headings``: the words joined by
    single spaces, with one trailing space.
    """
    if len(heading) <= 10:
        # Short runs stay inline.
        return heading.strip()
    if heading.strip().endswith(" A"):
        # Split a trailing "A" off onto the line after the heading
        # (presumably it is the first word of the following sentence).
        formatted = "\n" + heading[:-2] + "\n" + heading[-2:]
    else:
        formatted = "\n" + heading + "\n"
    return formatted.strip(" ")


def format_headings(text):
    """Put long runs of upper-case words on their own line.

    The text is split on single spaces. Consecutive upper-case tokens are
    gathered into a run; a run longer than 10 characters (including its
    trailing space) is wrapped in newlines, a shorter one is kept inline.
    A run that ends the text is emitted as well instead of being dropped.

    Args:
        text: The text to reformat.

    Returns:
        The tokens re-joined with single spaces, with heading runs formatted.
    """
    formatted_lines = []
    heading = ""
    for line in text.split(" "):
        if line and line.isupper():
            heading += line + " "
            continue
        if heading:
            formatted_lines.append(_format_heading_run(heading))
            heading = ""
        formatted_lines.append(line.strip())
    if heading:
        # Flush a run that reaches the end of the text.
        formatted_lines.append(_format_heading_run(heading))
    return " ".join(formatted_lines)


def format_live_site(text):
    """Re-insert line breaks and spacing into text scraped from a web page.

    Args:
        text: The raw text extracted from the page.

    Returns:
        The reformatted text.
    """
    # insert a newline between lowercase and uppercase letters
    formatted_text = re.sub(r"([a-z])([A-Z])", r"\1\n\2", text)
    # format the "What's included" items
    formatted_text = re.sub(
        r"([a-z])(\d+\.\d+[MK])", r"\1\n\2 ", formatted_text
    )
    # place headings in all caps on their own line
    formatted_text = format_headings(formatted_text)
    # add a space after ':', ';', ',', '!', '?' if they are followed by a character
    formatted_text = re.sub(r"([:;,!?])(\S)", r"\1 \2", formatted_text)
    return formatted_text


def extract_text_from_html(url, timeout=30):
    """Download *url* and return its visible text, reformatted.

    The contents of ``style``, ``script``, ``code`` and ``a`` tags are
    discarded before the text is collected.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The formatted page text, or the string ``"Unable to extract URL"``
        when the request fails, the response status is not 200, or the
        page cannot be parsed.

    Raises:
        ImportError: If ``requests`` or ``bs4`` is not installed.
    """
    # Imported here because the module never imports these names; without
    # them every call hit a NameError that the handler below swallowed.
    import requests
    from bs4 import BeautifulSoup

    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            # Previously this fell through with ``soup`` unbound and crashed.
            return "Unable to extract URL"
        soup = BeautifulSoup(response.content, "html.parser")
    except Exception:
        # Best effort: any network or parsing failure yields the sentinel.
        return "Unable to extract URL"

    # Remove tags whose content should not appear in the extracted text.
    for tag in soup(["style", "script", "code", "a"]):
        tag.decompose()
    text = " ".join(soup.stripped_strings)
    return format_live_site(text)