"""Text normalisation helpers applied before feeding text to CamemBERT.

Each helper takes a string and returns a new string; none of them mutates
its argument. ``preprocess_text_for_camembert`` chains them in a fixed order.
"""
import re

import pandas as pd  # not referenced in this section; kept for other code in the file

# "euro"/"euros" as a standalone word (digits may touch it, letters may not),
# or the euro sign.  ``[^\W\d_]`` is "any Unicode letter".
_EURO_RE = re.compile(r'(?<![^\W\d_])euros?(?![^\W\d_])|€')

# "m2" or "m²".
_M2_RE = re.compile(r'm[2²]')

# French phone numbers (+33 / 0033 / 0 prefixed, various separators), plus a
# "dd dd ddd ddd" grouping.
_PHONE_RE = re.compile(
    r'(?:(?:\+|00)33[\s.-]{0,3}(?:\(0\)[\s.-]{0,3})?|0)[1-9](?:(?:[\s.-]?\d{2}){4}|\d{2}(?:[\s.-]?\d{3}){2})|(\d{2}[ ]\d{2}[ ]\d{3}[ ]\d{3})'
)

# Lower-case "fr"-prefixed digit groups in three layouts.
_IBAN_RE = re.compile(
    r'fr\d{2}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{2}|fr\d{20}|fr[ ]\d{2}[ ]\d{3}[ ]\d{3}[ ]\d{3}[ ]\d{5}'
)

# Whitespace that sits strictly between two digits.  Lookarounds keep the
# digits out of the match so consecutive gaps ("1 2 3") are all collapsed.
_SPACE_BETWEEN_DIGITS_RE = re.compile(r'(?<=\d)\s+(?=\d)')

# E-mail address.  The negative lookahead rejects ".." inside the local part
# only; the dot before the TLD is a literal dot.
_EMAIL_RE = re.compile(
    r'(?:(?![^@\s]*\.\.)[a-zA-Z0-9](?:[a-zA-Z0-9.+!%-]{1,64}|)|\"[a-zA-Z0-9.+!% -]{1,64}\")'
    r'@[a-zA-Z0-9][a-zA-Z0-9.-]*\.(?:[a-z]{2,}|[0-9]+)'
)

# "ref"/"réf" followed by "." or a space and digits, optionally parenthesised.
_REF_RE = re.compile(r'(\(*)(ref|réf)(\.|[ ])\d+(\)*)')

# Optional http(s):// prefix followed by dot-separated lower-case labels.
_WEBSITE_RE = re.compile(r'(http\:\/\/|https\:\/\/)?([a-z0-9][a-z0-9\-]*\.)+[a-z][a-z\-]*')


def treat_euro(text: str) -> str:
    """Normalise "euro", "euros" and "€" to " euros".

    Only standalone occurrences are rewritten: the word may touch digits
    ("100euros") but not letters ("europe" is left alone).  The character
    following the word is preserved.
    """
    return _EURO_RE.sub(' euros', text)


def treat_m2(text: str) -> str:
    """Normalise "m2" and "m²" to " m²"."""
    return _M2_RE.sub(' m²', text)


def filter_phone_numbers(text: str) -> str:
    """Remove substrings that match the phone-number pattern."""
    return _PHONE_RE.sub('', text)


def filter_ibans(text: str) -> str:
    """Remove substrings that match the lower-case "fr…" IBAN patterns."""
    return _IBAN_RE.sub('', text)


def remove_space_between_numbers(text: str) -> str:
    """Delete every run of whitespace located between two digits.

    "1 000 000" becomes "1000000" and "1 2 3" becomes "123".
    """
    return _SPACE_BETWEEN_DIGITS_RE.sub('', text)


def filter_emails(text: str) -> str:
    """Remove e-mail addresses, leaving the surrounding text untouched."""
    return _EMAIL_RE.sub('', text)


def filter_ref(text: str) -> str:
    """Remove listing references such as "(ref 123)" or "réf.42"."""
    return _REF_RE.sub('', text)


def filter_websites(text: str) -> str:
    """Remove substrings that match the website pattern (scheme + host)."""
    return _WEBSITE_RE.sub('', text)


def preprocess_text_for_camembert(text: str) -> str:
    """Lower-case *text* and run every cleaning step in a fixed order.

    Steps: lower-casing, non-breaking-space replacement, unit normalisation
    (m², euros), removal of phone numbers, e-mails, IBANs, references and
    websites, and finally removal of whitespace between digits.
    """
    text = text.lower()
    text = text.replace('\xa0', ' ')

    # NOTE(review): phone numbers are filtered before IBANs, so a digit run
    # inside a space-separated IBAN can be consumed by the phone pattern
    # first -- confirm whether this order is intended.
    steps = (
        treat_m2,
        treat_euro,
        filter_phone_numbers,
        filter_emails,
        filter_ibans,
        filter_ref,
        filter_websites,
        remove_space_between_numbers,
    )
    for step in steps:
        text = step(text)
    return text