import json
import time

import requests
from nltk import sent_tokenize
from sklearn.cluster import AgglomerativeClustering

from .utils import get_sbert_embedding, clean_text

# Feature text per document is capped near MAX_LENGTH_FEATURE characters;
# documents whose extracted feature is shorter than MIN_LENGTH_FEATURE are dropped.
MAX_LENGTH_FEATURE = 250
MIN_LENGTH_FEATURE = 100

URL_CHECK_SPAM = "http://10.9.3.70:30036/predict"

def check_spam(docs):
    """Drop documents flagged as spam by the external classification service."""
    json_body = {
        "domain_id": "",
        "records": [
            {
                "text": doc.get("message", ""),
                "idxcol": 1,
            }
            for doc in docs
        ],
    }
    result = requests.post(URL_CHECK_SPAM, json=json_body, timeout=30).json()
    # Keep only documents the classifier labeled 0 (taken to mean "not spam").
    return [x for i, x in enumerate(docs) if result[i]["label"] == 0]
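# The request/response contract assumed by check_spam (hypothetical shapes,
# inferred from how `result` is indexed above; the service itself is not
# documented in this repo):
#
#   request:  {"domain_id": "", "records": [{"text": "...", "idxcol": 1}, ...]}
#   response: [{"label": 0}, {"label": 1}, ...]   # one record per input, same order
#
# A non-zero label drops the corresponding document.
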
def process_feature(doc):
    """Build a short representative "feature" text from a document's message.

    Whole paragraphs are appended while the feature stays under
    MAX_LENGTH_FEATURE characters; if the feature is still shorter than
    MIN_LENGTH_FEATURE, the next paragraph is split into sentences and
    appended sentence by sentence instead.
    """
    message = doc.get("message", "")
    paras = message.split("\n")
    feature = ""
    # Keep non-trivial paragraphs (> 10 chars) and clean them without normalization.
    paras = [clean_text(x.strip(), normalize=False) for x in paras if x.strip() and len(x.strip()) > 10]
    for para in paras:
        if len(feature) + len(para) < MAX_LENGTH_FEATURE:
            feature += " " + para
        elif len(feature) < MIN_LENGTH_FEATURE:
            # Paragraph too long to append whole: fall back to sentence granularity.
            for sen in sent_tokenize(para):
                if len(feature) + len(sen) < MAX_LENGTH_FEATURE or len(feature.strip()) < MIN_LENGTH_FEATURE:
                    feature += " " + sen
    return feature
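# Illustrative behavior (hypothetical input; assumes clean_text leaves plain
# ASCII text unchanged):
#
#   >>> process_feature({"message": "First paragraph of the post.\nSecond one."})
#   ' First paragraph of the post. Second one.'
#
# Fragments are joined with a leading space on each append.
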
def topic_clustering(docs, distance_threshold, top_cluster=5, top_sentence=5, topn_summary=5,
                     sorted_field='', max_doc_per_cluster=50, delete_message=True, is_check_spam=True):
    """Cluster documents into topics via agglomerative clustering on SBERT embeddings.

    Note: top_sentence, topn_summary, sorted_field, max_doc_per_cluster and
    delete_message are accepted for API compatibility but are not used in the
    body below.
    """
    # Drop very short documents and cap the batch size.
    docs = [x for x in docs if len(x.get("message", "")) > 100]
    docs = docs[:30000]
    if is_check_spam:
        docs = check_spam(docs)
    result = {}
    cluster_score = {}

    t1 = time.time()
    if len(docs) < 1:
        return result
    elif len(docs) == 1:
        # A single document is its own (and only) cluster.
        return {"0": docs}

    # Extract a representative feature per document; documents whose feature
    # is shorter than MIN_LENGTH_FEATURE are discarded.
    f_docs = []
    for x in docs:
        ft = process_feature(x)
        if len(ft) > MIN_LENGTH_FEATURE:
            x["title"] = ft
            f_docs.append(x)
    docs = f_docs

    features = [x["title"] for x in docs]
    # Embed each feature text with SBERT: one dense vector per document.
    vectors = get_sbert_embedding(features)
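    # `get_sbert_embedding` comes from .utils and is not shown here. A minimal
    # sketch of what it is assumed to do, via sentence-transformers (the model
    # name below is an assumption, not necessarily the project's choice):
    #
    #   from sentence_transformers import SentenceTransformer
    #   _model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
    #
    #   def get_sbert_embedding(texts):
    #       return _model.encode(texts)  # shape: (len(texts), dim)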
    # Cut the dendrogram at distance_threshold instead of fixing n_clusters:
    # with complete linkage, two clusters merge only while the maximum pairwise
    # cosine distance between their documents stays below the threshold.
    # NOTE: scikit-learn >= 1.2 renames the `affinity` parameter to `metric`.
    clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
                                        linkage='complete', distance_threshold=distance_threshold)
    clusteror.fit(vectors)
    print(f"Time encode + clustering: {time.time() - t1} {clusteror.n_clusters_}")
    # Prepare per-cluster doc lists and best-score trackers (1-based string keys).
    for i in range(clusteror.n_clusters_):
        result[str(i + 1)] = []
        cluster_score[str(i + 1)] = 0
    # Assign each document to its cluster, skipping blacklisted domains and
    # empty messages, and cleaning the text fields on the way.
    for i in range(len(clusteror.labels_)):
        cluster_no = clusteror.labels_[i]
        if docs[i].get('domain', '') in ["cungcau.vn", "baomoi.com", "news.skydoor.net"]:
            continue
        if not docs[i].get('message', '').strip():
            continue
        response_doc = docs[i]
        # Track the highest document score seen in this cluster.
        score = response_doc.get('score', 0)
        if score > cluster_score[str(cluster_no + 1)]:
            cluster_score[str(cluster_no + 1)] = score
        if 'title' in response_doc:
            response_doc['title'] = clean_text(response_doc['title'])
        if 'snippet' in response_doc:
            response_doc['snippet'] = clean_text(response_doc['snippet'])
        if 'message' in response_doc:
            response_doc['message'] = clean_text(response_doc['message'])
        result[str(cluster_no + 1)].append(response_doc)
    # Sort each cluster's docs by message length (longest first), attach
    # num_docs / max_score to the leading doc, and drop empty clusters.
    empty_clus_ids = []
    for x in result:
        result[x] = sorted(result[x], key=lambda i: -len(i.get('message', '')))
        if len(result[x]) > 0:
            result[x][0]['num_docs'] = len(result[x])
            result[x][0]['max_score'] = cluster_score[x]
        else:
            empty_clus_ids.append(x)
    for x in empty_clus_ids:
        result.pop(x, None)

    # Keep only the top_cluster largest clusters.
    result = dict(sorted(result.items(), key=lambda i: -len(i[1]))[:top_cluster])
    return result
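# Shape of the returned value (illustrative):
#
#   {
#     "3": [{"title": ..., "message": ..., "num_docs": 12, "max_score": 0.9, ...}, ...],
#     "7": [...],
#   }
#
# Keys are 1-based cluster ids; only the top_cluster largest clusters survive,
# and each cluster's first (longest-message) doc carries num_docs / max_score.
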
if __name__ == '__main__':
    # Ad-hoc smoke test against a local dump of social documents.
    with open("/home2/vietle/news-cms/topic_summarization/data/news_cms.social.json", 'r') as f:
        docs = json.load(f)[:10000]
    clusters = topic_clustering(docs, distance_threshold=0.2, top_cluster=5000, top_sentence=5,
                                topn_summary=5, sorted_field='', max_doc_per_cluster=50, delete_message=False)
    with open("/home2/vietle/news-cms/topic_summarization/cluster/news_cms.social.json", 'w') as f:
        json.dump(clusters, f, ensure_ascii=False)
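# Note: the relative import (`from .utils import ...`) means this file must run
# in its package context, e.g. `python -m topic_summarization.<module>` (module
# path assumed), rather than being executed directly as a script.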