|
import pandas as pd |
|
from transformers import AutoTokenizer, AutoModel |
|
from sentence_transformers import SentenceTransformer, util |
|
import numpy as np |
|
import torch |
|
import gradio as gr |
|
|
|
|
|
|
|
# Built-in catalogue of expertise areas. Every entry carries three keys, in this
# order: "topic" (short title), "description" (free-text summary) and
# "experts" (list of expert names attached to the topic).
categories = [
    dict(
        topic="Confidentiality and Privacy Protection",
        description="This topic covers the protection of confidentiality, privacy, and integrity in security systems. It also includes authentication and authorization processes.",
        experts=["Mireille"],
    ),
    dict(
        topic="Distributed Trust and End-User Trust Models",
        description="This topic focuses on distributed trust models and how end-users establish trust in secure systems.",
        experts=["Mireille", "Khawla"],
    ),
    dict(
        topic="Secure Element and Key Provisioning",
        description="This topic involves the secure element in systems and the process of key provisioning.",
        experts=["Mireille"],
    ),
    dict(
        topic="Residential Gateway Security",
        description="This topic covers the security aspects of Residential Gateways.",
        experts=["Mireille"],
    ),
    dict(
        topic="Standalone Non-Public Network (SNPN) Inter-Connection and Cybersecurity",
        description="This topic focuses on the inter-connection of Standalone Non-Public Networks and related cyber-security topics.",
        experts=["Khawla"],
    ),
    dict(
        topic="Distributed Ledger and Blockchain in SNPN",
        description="This topic covers the use of distributed ledger technology and blockchain in securing Standalone Non-Public Networks.",
        experts=["Khawla"],
    ),
    dict(
        topic="Distributed Networks and Communication",
        description="This topic involves distributed networks such as mesh networks, ad-hoc networks, and multi-hop networks, and their cyber-security aspects.",
        experts=["Guillaume"],
    ),
    dict(
        topic="Swarm of Drones and Unmanned Aerial Vehicles Network Infrastructure",
        description="This topic covers the network infrastructure deployed by Swarm of Drones and Unmanned Aerial Vehicles.",
        experts=["Guillaume"],
    ),
    dict(
        topic="USIM and Over-the-Air Services",
        description="This topic involves USIM and related over-the-air services such as Steering of Roaming, roaming services, network selection, and UE configuration.",
        experts=["Vincent"],
    ),
    dict(
        topic="Eco-Design and Societal Impact of Technology",
        description="This topic covers eco-design concepts, including energy saving, energy efficiency, carbon emissions, and the societal impact of technology.",
        experts=["Pierre"],
    ),
    dict(
        topic="Service Requirements of New Services",
        description="This topic involves defining service requirements for new services, detecting low signals of new trends and technologies, and assessing their impact on USIM services or over-the-air services.",
        experts=["Ly-Thanh"],
    ),
    dict(
        topic="Satellite and Non Terrestrial Networks",
        description="This topic covers satellite networks, Non Terrestrial Networks, Private Networks, IoT, Inter Satellite communication, and Radio Access Network.",
        experts=["Nicolas"],
    ),
    dict(
        topic="Public Safety and Emergency Communication",
        description="This topic involves Public Safety Communication, Military Communication, Emergency Calls, Emergency Services, Disaster Communication Access, and other related areas.",
        experts=["Dorin"],
    ),
    dict(
        topic="Identifying the Human User of a Subscription",
        description="This topic involves methods and processes for identifying the human user associated with a subscription.",
        experts=["Kumar"],
    ),
    dict(
        topic="Authentication and Authorization of Users and Restrictions on Users",
        description="This topic covers authentication and authorization processes, as well as restrictions imposed on users.",
        experts=["Kumar"],
    ),
    dict(
        topic="Exposure of User Identity Profile Information",
        description="This topic involves the exposure of user identity profile information and its security implications.",
        experts=["Kumar"],
    ),
    dict(
        topic="Identifying non-3GPP Devices Connecting behind a UE or 5G-RG",
        description="This topic involves identifying non-3GPP devices connecting behind a UE (User Equipment) or 5G-RG (5G Residential Gateway).",
        experts=["Kumar"],
    ),
]
|
|
|
def add_categories(df, df_all):
    """Append the rows of ``df`` whose topic is not yet in ``df_all``.

    Args:
        df: DataFrame of candidate categories. It must have a ``topic``
            column; ``description`` and ``experts`` are read when present.
        df_all: DataFrame holding the categories accumulated so far.

    Returns:
        A 2-tuple ``(update, merged)``. ``update`` is a ``gr.update`` whose
        ``choices`` is the merged list of topics; ``merged`` is a new
        DataFrame with the rows of ``df_all`` followed by the newly added
        ones. Neither input frame is modified.
    """
    core_columns = ("topic", "description", "experts")
    candidates = df.to_dict("records")
    categories_all = df_all.to_dict("list")

    # An empty or partial ``df_all`` may lack a core column; create it, padded
    # to the current row count, so that rows can be appended without KeyError.
    existing_rows = len(df_all)
    for column in core_columns:
        categories_all.setdefault(column, [None] * existing_rows)

    # Any other column has to grow in step with the core ones, otherwise the
    # lists end up with unequal lengths and rebuilding the DataFrame fails.
    extra_columns = [name for name in categories_all if name not in core_columns]

    for cat in candidates:
        if cat['topic'] in categories_all['topic']:
            continue
        categories_all['topic'].append(cat['topic'])
        categories_all['description'].append(cat.get('description'))
        # Fall back to a singular 'expert' header so a frame using that
        # spelling is still accepted instead of raising KeyError.
        categories_all['experts'].append(cat.get('experts', cat.get('expert')))
        for name in extra_columns:
            categories_all[name].append(None)
    print(f"AFTER ADDINGS Those are the categories_all : {categories_all}")

    return gr.update(choices=categories_all['topic']), pd.DataFrame.from_dict(categories_all)
|
|
|
# Default category table built from ``categories``, and the plain list of its
# topic names.
df_cate = pd.DataFrame(data=categories)
df_cat_filter = df_cate["topic"].tolist()
|
|
|
def filter_by_topics(filters, categories):
    """Build an editable Gradio table holding only the selected topics.

    Args:
        filters: Container of topic names to keep. ``None`` is treated as
            "nothing selected".
        categories: DataFrame of categories with a ``topic`` column.

    Returns:
        A ``gr.DataFrame`` labelled ``'categories'`` whose value contains the
        rows of ``categories`` whose topic is in ``filters``, in their
        original order.
    """
    wanted = filters if filters is not None else []
    rows = categories.to_dict("records")
    value_filtered = [row for row in rows if row['topic'] in wanted]

    # Passing the source columns keeps the headers even when no row matches;
    # a DataFrame built from an empty list alone would have no columns at all.
    filtered_df = pd.DataFrame(value_filtered, columns=categories.columns)
    return gr.DataFrame(label='categories', value=filtered_df, interactive=True)
|
|
|
|
|
|
|
def reset_cate(df_categories):
    """Toggle the category table between the defaults and one blank row.

    Args:
        df_categories: The category DataFrame currently in use.

    Returns:
        A single-row DataFrame of empty strings when ``df_categories`` equals
        the default table ``df_cate``; otherwise a copy of ``df_cate``.
    """
    if df_categories.equals(df_cate):
        # The blank row uses the same headers as the default table. The third
        # one is 'experts' (plural): that is the key the category entries
        # define and the one the other functions in this file read.
        return pd.DataFrame([['', '', '']], columns=['topic', 'description', 'experts'])
    return df_cate.copy()
|
|
|
|
|
def load_data(file_obj):
    """Read an Excel workbook into a DataFrame.

    ``file_obj`` is handed unchanged to ``pandas.read_excel`` with all of its
    default options.
    """
    workbook = pd.read_excel(file_obj)
    return workbook
|
|
|
|
|
def initialize_models():
    """Instantiate and return the "all-mpnet-base-v2" SentenceTransformer."""
    return SentenceTransformer("all-mpnet-base-v2")
|
|
|
|
|
def generate_embeddings(df, model, Column):
    """Embed the text of one column and store it in ``df['Embeddings']``.

    For every row whose ``Column`` value is a string, the text is encoded
    with ``model.encode(..., convert_to_tensor=True)``. When the frame has a
    separate ``Title`` column holding a string for that row, the title and a
    newline are prepended to the text first. Rows whose ``Column`` value is
    not a string get ``np.nan``.

    Args:
        df: DataFrame to process. It is modified in place.
        model: Object exposing ``encode(text, convert_to_tensor=True)``.
        Column: Name of the column holding the text to embed.

    Returns:
        The same ``df`` object, with the ``Embeddings`` column added.
    """
    # Decided once for the whole frame. The title is only prepended when it is
    # a different column from the one being embedded, so that choosing
    # "Title" itself does not embed the title twice.
    use_title = 'Title' in df.columns and Column != 'Title'

    embeddings_list = []
    for index, row in df.iterrows():
        text = row[Column]
        if not isinstance(text, str):
            embeddings_list.append(np.nan)
            continue

        print(index)  # progress trace: index label of the row being encoded
        if use_title and isinstance(row["Title"], str):
            content = row["Title"] + "\n" + text
        else:
            content = text
        embeddings_list.append(model.encode(content, convert_to_tensor=True))

    df['Embeddings'] = embeddings_list
    return df
|
|
|
|
|
def process_categories(categories, model):
    """Return a category table with an added ``Embeddings`` column.

    Args:
        categories: Anything ``pd.DataFrame`` accepts (for example a list of
            dicts or a DataFrame) that provides a ``description`` column.
        model: Object exposing ``encode(text, convert_to_tensor=True)``.

    Returns:
        A new DataFrame with the input columns plus ``Embeddings``, which
        holds the encoding of each row's ``description``.
    """
    # Work on a copy so that a DataFrame passed by the caller never gains the
    # extra column as a side effect.
    category_df = pd.DataFrame(categories).copy()

    # A plain comprehension avoids ``DataFrame.apply(axis=1)``, which returns
    # a DataFrame rather than a Series for an empty table and therefore
    # cannot be assigned to a single column.
    vectors = [
        model.encode(text, convert_to_tensor=True)
        for text in category_df['description']
    ]
    category_df['Embeddings'] = pd.Series(vectors, index=category_df.index, dtype=object)
    return category_df
|
|
|
|
|
|
|
def match_categories(df, category_df, treshold=0.45):
    """Attach to every row of ``df`` the categories that match it.

    For each value of ``df['Embeddings']`` that is a ``torch.Tensor``, the
    cosine similarity with every category embedding is computed and the
    categories scoring strictly above ``treshold`` are kept, in table order.

    Args:
        df: DataFrame with an ``Embeddings`` column. It is modified in place.
        category_df: DataFrame with ``topic``, ``description``, ``experts``
            and ``Embeddings`` columns.
        treshold: Minimum (exclusive) cosine similarity for a match. The
            spelling is kept because callers pass it by keyword.

    Returns:
        The same ``df`` object with four added columns: ``Description``,
        ``Expert``, ``Topic`` and ``Score``. For embedded rows each holds a
        list, one entry per matching category (empty when nothing matches).
        For rows without a tensor embedding the first three hold ``np.nan``
        and ``Score`` holds the string ``'pas interessant'`` (French for
        "not interesting").
    """
    # Read the category columns into plain lists once. Lookups below are then
    # positional and do not depend on the labels of ``category_df``'s index.
    descriptions = list(category_df['description'])
    experts = list(category_df['experts'])
    topics = list(category_df['topic'])
    category_vectors = list(category_df['Embeddings'])
    # Stacked on first use only, then reused for every following row.
    category_matrix = None

    categories_list, experts_list, topic_list, scores_list = [], [], [], []
    for ebd_content in df['Embeddings']:
        if not isinstance(ebd_content, torch.Tensor):
            categories_list.append(np.nan)
            experts_list.append(np.nan)
            topic_list.append(np.nan)
            scores_list.append('pas interessant')
            continue

        if category_vectors:
            if category_matrix is None:
                category_matrix = torch.stack(category_vectors, dim=0)
            cos_scores = util.cos_sim(ebd_content, category_matrix)[0]
            high_score_indices = [i for i, score in enumerate(cos_scores) if score > treshold]
            scores = [float(cos_scores[i]) for i in high_score_indices]
        else:
            # ``torch.stack`` rejects an empty list: with no categories there
            # is simply nothing to match.
            high_score_indices, scores = [], []

        categories_list.append([descriptions[i] for i in high_score_indices])
        experts_list.append([experts[i] for i in high_score_indices])
        topic_list.append([topics[i] for i in high_score_indices])
        scores_list.append(scores)

    df["Description"] = categories_list
    df["Expert"] = experts_list
    df["Topic"] = topic_list
    df["Score"] = scores_list
    return df
|
|
|
def flatten_nested_lists(nested_list):
    """Return the non-list elements of ``nested_list`` as one flat list.

    Elements that are ``list`` instances are expanded recursively, depth
    first, so the left-to-right order is preserved. Anything else, including
    tuples, is kept as a single element.
    """
    flat = []
    for element in nested_list:
        flat += flatten_nested_lists(element) if isinstance(element, list) else [element]
    return flat
|
|
|
def save_data(df, filename):
    """Write the classified rows to an Excel file next to the input file.

    The list-valued ``Expert``, ``Description``, ``Topic`` and ``Score``
    cells are joined into comma-separated strings; this rewrites those
    columns of the ``df`` passed in. The ``Embeddings`` column is left out of
    the written file only.

    Args:
        df: DataFrame holding the ``Expert``, ``Description``, ``Topic``,
            ``Score`` and ``Embeddings`` columns.
        filename: Path of the source file. The output path is the same path
            with ``_classified`` inserted just before the extension.

    Returns:
        The path of the file that was written.
    """
    import os  # local import keeps this function self-contained

    def _join(values):
        # str() every entry so a non-string value cannot make join() raise.
        return ', '.join(map(str, values))

    # Expert entries are themselves lists of names, hence the flattening.
    df['Expert'] = df['Expert'].apply(
        lambda x: _join(flatten_nested_lists(x)) if isinstance(x, list) else x
    )
    for column in ('Description', 'Topic', 'Score'):
        df[column] = df[column].apply(lambda x: _join(x) if isinstance(x, list) else x)

    df = df.drop(columns=['Embeddings'])

    # Split off the extension only: replacing every "." would also rewrite
    # dots in directory names, in a leading "./" or inside the file stem.
    root, extension = os.path.splitext(filename)
    new_filename = f"{root}_classified{extension}"
    df.to_excel(new_filename, index=False)
    return new_filename
|
|
|
def classification(column, file_path, categories, treshold):
    """Run the full pipeline: load, embed, match, save.

    Args:
        column: Name of the column of the workbook whose text is classified.
        file_path: Location of the Excel workbook; also used to derive the
            output file name.
        categories: Category definitions handed to ``process_categories``.
        treshold: Similarity threshold forwarded to ``match_categories``.

    Returns:
        A 2-tuple ``(output_path, documents)``: the value returned by
        ``save_data`` and the classified DataFrame.
    """
    documents = load_data(file_path)
    encoder = initialize_models()

    documents = generate_embeddings(documents, encoder, column)
    category_table = process_categories(categories, encoder)
    documents = match_categories(documents, category_table, treshold=treshold)

    # Saving happens before the result tuple is built, exactly as in a
    # left-to-right evaluation of ``save_data(...), documents``.
    output_path = save_data(documents, file_path)
    return output_path, documents
|
|