import re
from vncorenlp import VnCoreNLP
from nltk.tokenize import sent_tokenize
import torch
from sentence_transformers import SentenceTransformer
import datetime
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import requests
import json
from . import utils
import time
from summary import text_summary, get_summary_bert
# from . import detect_time as dt

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SentenceTransformer('VoVanPhuc/sup-SimCSE-VietNamese-phobert-base').to(device)
model_en = SentenceTransformer('paraphrase-mpnet-base-v2').to(device)
annotator = VnCoreNLP('vncorenlp/VnCoreNLP-1.1.1.jar', port=9191, annotators="wseg,pos", max_heap_size='-Xmx8g')


def detect_postaging(text_in):
    word_segmented_text = annotator.annotate(text_in)
    lst_k = []
    for se in word_segmented_text["sentences"]:
        for kw in se:
            if kw["posTag"] in ("Np", "Ny", "N"):
                if kw["posTag"] == "N" and "_" not in kw["form"]:
                    continue
                lst_k.append(kw["form"].replace("_", " "))
    return list(set(lst_k))


def clean_text(text_in):
    # css and js
    doc = re.sub('<.*?>', '', text_in)
    doc = re.sub('(function).*}', ' ', doc)
    # link: strip "Nguồn <url>" citations first, then bare urls, in the original order
    link_patterns = [
        r'(http:\/\/).*?(\.htm)', r'(http:\/\/).*?(\.html)',
        r'(https:\/\/).*?(\/\/)', r'(https:\/\/).*?(\.htm)', r'(https:\/\/).*?(\.html)',
        r'(https:\/\/).*?(\.vn)', r'(https:\/\/).*?(\.net)', r'(https:\/\/).*?(\.vgp)',
        r'(http:\/\/).*?(\.vgp)',
    ]
    for pattern in link_patterns:
        doc = re.sub(r'(Nguồn)\s*?' + pattern, ' ', doc)
    for pattern in link_patterns:
        doc = re.sub(pattern, ' ', doc)
    # escape sequence
    doc = re.sub('\n', ' ', doc)
    doc = re.sub('\t', ' ', doc)
    doc = re.sub('\r', ' ', doc)
    return doc


def data_cleaning(docs):
    # 'message' gets the same css/js, link and escape-sequence cleaning as clean_text.
    res = []
    for d in docs:
        if 'message' in d:
            d['message'] = clean_text(d['message'])
            res.append(d)
    return res


def segment(docs, lang="vi"):
    segmented_docs = []
    for d in docs:
        # if len(d.get('message', "")) > 8000 or len(d.get('message', "")) < 100:
        #     continue
        if 'snippet' not in d and 'title' not in d:
            continue
        try:
            if lang == "vi":
                snippet = d.get('snippet', "")
                segmented_snippet = ""
                segmented_sentences_snippet = annotator.tokenize(snippet)
                for sentence in segmented_sentences_snippet:
                    segmented_snippet += ' ' + ' '.join(sentence)
                segmented_snippet = segmented_snippet.replace('\xa0', '')
                d['segmented_snippet'] = segmented_snippet
            segmented_docs.append(d)
        except Exception:
            pass
    return segmented_docs


def timestamp_to_date(timestamp):
    return datetime.datetime.fromtimestamp(timestamp).strftime('%d/%m/%Y')


def sort_content(lst_res):
    # Sort documents by the length of their 'message', longest first.
    lst_content = []
    lst_cnt = []
    for i in range(len(lst_res)):
        lst_cnt.append(len(lst_res[i].get("message", "")))
    id_sort = np.argsort(np.array(lst_cnt))[::-1]
    for i in id_sort:
        lst_content.append(lst_res[i])
    return lst_content


def post_processing(response, top_cluster=5, top_sentence=5, topn_summary=5):
    lst_ids = []
    lst_top = []
    lst_res = []
    for i in response:
        lst_ids.append(i)
        lst_top.append(len(response[i]))
    idx = np.argsort(np.array(lst_top))[::-1]
    if top_cluster == -1:
        top_cluster = len(idx)
    for i in idx[:top_cluster]:
        ik = lst_ids[i]
        if top_sentence == -1:
            top_sentence = len(response[ik])
        lst_check_title = []
        lst_check_not_title = []
        i_c_t = 0
        response_sort = sort_content(response[ik].copy())
        for resss in response_sort:
            if resss.get("title", ""):
                lst_check_title.append(resss)
                i_c_t += 1
            else:
                lst_check_not_title.append(resss)
            if i_c_t == top_sentence:
                break
        if i_c_t == top_sentence:
            lst_res.append(lst_check_title)
        else:
            lst_check_title.extend(lst_check_not_title)
            lst_res.append(lst_check_title[:top_sentence])
    dict_res = {}
    for i in range(len(lst_res)):
        dict_res[str(i + 1)] = lst_res[i]
        for j in range(min(len(dict_res[str(i + 1)]), 3)):
            dict_res[str(i + 1)][0]["title_summarize"].append(dict_res[str(i + 1)][j].get("snippet", ""))
        summary_text = get_summary_bert(dict_res[str(i + 1)][0].get("message", ""),
                                        lang=dict_res[str(i + 1)][0].get("lang", "vi"),
                                        topn=topn_summary)
        if len(summary_text) < 10:
            summary_text = dict_res[str(i + 1)][0].get("snippet", "")
        if len(summary_text) < 10:
            summary_text = dict_res[str(i + 1)][0].get("title", "")
        dict_res[str(i + 1)][0]["content_summary"] = utils.remove_image_keyword(summary_text)
        key_phrases = []
        dict_res[str(i + 1)][0]["topic_keywords"] = key_phrases
        for j in range(len(dict_res[str(i + 1)])):
            if "message" in dict_res[str(i + 1)][j]:
                del dict_res[str(i + 1)][j]["message"]
    return dict_res


def get_lang(docs):
    lang_vi = 0
    lang_en = 0
    docs_lang_vi = []
    docs_lang_en = []
    for d in docs:
        if d.get("lang", "") == "en":
            lang_en += 1
            docs_lang_en.append(d)
        else:
            lang_vi += 1
            docs_lang_vi.append(d)
    if lang_vi > lang_en:
        return "vi", docs_lang_vi
    return "en", docs_lang_en


def topic_clustering(docs, distance_threshold, top_cluster=5, top_sentence=5, topn_summary=5, benchmark_id=1):
    global model, model_en
    lang, docs = get_lang(docs)
    result = {}
    docs = segment(docs, lang=lang)
    print("docs segment: ", len(docs))
    if len(docs) < 2:
        return result
    if lang == "vi":
        features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
        vectors = model.encode(features, show_progress_bar=False)
    else:
        features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
" + doc.get('snippet', "") for doc in docs] vectors = model_en.encode(features, show_progress_bar=False) clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine', linkage='single', distance_threshold=distance_threshold) clusteror.fit(vectors) print(clusteror.n_clusters_) for i in range(clusteror.n_clusters_): result[str(i + 1)] = [] for i in range(len(clusteror.labels_)): cluster_no = clusteror.labels_[i] response_doc = {} if 'url' in docs[i]: response_doc['url'] = docs[i]['url'] if 'domain' in docs[i]: response_doc['domain'] = docs[i]['domain'] if 'title' in docs[i]: response_doc['title'] = clean_text(docs[i]['title']) if 'snippet' in docs[i]: response_doc['snippet'] = clean_text(docs[i]['snippet']) if 'created_time' in docs[i]: response_doc['created_time'] = docs[i]['created_time'] if 'message' in docs[i]: response_doc['message'] = clean_text(docs[i]['message']) if 'id' in docs[i]: response_doc['id'] = docs[i]['id'] response_doc['score'] = 0.0 response_doc['title_summarize'] = [] response_doc['content_summary'] = "" response_doc['total_facebook_viral'] = 0 result[str(cluster_no + 1)].append(response_doc) # print("before filter: ", len(result)) # result = smart_filter(result, benchmark_id=benchmark_id) # print("after filter: ", len(result)) return post_processing(result, top_cluster=top_cluster, top_sentence=top_sentence, topn_summary=topn_summary) def convert_date(text): text = text.replace(".", "/") text = text.replace("-", "/") return text def check_keyword(sentence): keyword = ['sáng', 'trưa', 'chiều', 'tối', 'đến', 'hôm', 'ngày', 'tới'] for k in keyword: if k in sentence: return True return False def extract_events_and_time(docs, publish_date): def standardize(date_str): return date_str.replace('.', '/').replace('-', '/') def add_0(date_str): date_str = date_str.split('/') res = [] for o in date_str: o = re.sub('\s+', '', o) if len(o) < 2: o = '0' + o res.append(o) date_str = '/'.join(res) return date_str def get_date_list(reg, sentence): find_object = re.finditer(reg, sentence) date_list = [x.group() for x in find_object] return date_list year = publish_date.split('/')[2] # dd/mm/yyyy reg_exp_1 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)' # #mm/yyyy # reg_exp_5 = '(\D|^)(?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)' # dd/mm reg_exp_2 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])(\D|$)' # ngày dd tháng mm năm yyyy reg_exp_3 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}\s*(năm)\s*\d{4}' # ngày dd tháng mm reg_exp_4 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}' result = [] for d in docs: text = d['message'] for sentence in sent_tokenize(text): lower_sentence = sentence.lower() c = re.search(reg_exp_3, sentence.lower()) d = re.search(reg_exp_4, sentence.lower()) # e = re.search(reg_exp_5, sentence.lower()) a = re.search(reg_exp_1, sentence) b = re.search(reg_exp_2, sentence) # if (a or b or c or d) and check_keyword(lower_sentence): date_list = get_date_list(reg_exp_1, lower_sentence) date_entity = '' if date_list: date_entity = add_0(standardize(date_list[0])) elif get_date_list(reg_exp_2, lower_sentence): date_list = get_date_list(reg_exp_2, lower_sentence) date_entity = add_0(standardize(date_list[0]) + '/' + year) elif get_date_list(reg_exp_3, lower_sentence): date_list = get_date_list(reg_exp_3, lower_sentence) date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip() date_entity = re.sub('\s+', ' ', date_entity) date_entity = date_entity.replace(' ', 
                date_entity = add_0(date_entity)
            else:
                date_list = get_date_list(reg_exp_4, lower_sentence)
                if date_list:
                    date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
                    date_entity = re.sub(r'\s+', ' ', date_entity)
                    date_entity = date_entity.replace(' ', '/')
                    date_entity = date_entity + '/' + year
                    date_entity = add_0(date_entity)
            result.append((sentence, date_entity))
    return result
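

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of how topic_clustering might be invoked, assuming the module is run
# in a context where its relative imports resolve (e.g. `python -m <package>.<module>`),
# the VnCoreNLP jar is available at the configured path, and the SentenceTransformer
# models can be loaded. The sample documents and the 0.25 distance threshold are
# illustrative assumptions, not values taken from the original code.
if __name__ == "__main__":
    sample_docs = [
        {"title": "Giá xăng tăng", "snippet": "Giá xăng trong nước tăng từ ngày 1/6.",
         "message": "Giá xăng trong nước tăng từ ngày 1/6 theo quyết định điều hành.", "lang": "vi"},
        {"title": "Xăng dầu điều chỉnh giá", "snippet": "Xăng dầu được điều chỉnh giá vào kỳ điều hành.",
         "message": "Xăng dầu được điều chỉnh giá vào kỳ điều hành ngày 1/6.", "lang": "vi"},
        {"title": "Mưa lớn ở miền Trung", "snippet": "Miền Trung mưa lớn diện rộng trong hôm nay.",
         "message": "Miền Trung có mưa lớn diện rộng trong ngày hôm nay và ngày mai.", "lang": "vi"},
    ]
    clusters = topic_clustering(sample_docs, distance_threshold=0.25, top_cluster=5, top_sentence=5)
    print(json.dumps(clusters, ensure_ascii=False, indent=2))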