"""Match rows of an Excel sheet to expert topic categories via sentence embeddings."""

import os

import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch
import gradio as gr
import spaces

# Categories
categories = [
    {
        "topic": "Confidentiality and Privacy Protection",
        "description": "This topic covers the protection of confidentiality, privacy, and integrity in security systems. It also includes authentication and authorization processes.",
        "experts": ["Mireille"]
    },
    {
        "topic": "Distributed Trust and End-User Trust Models",
        "description": "This topic focuses on distributed trust models and how end-users establish trust in secure systems.",
        "experts": ["Mireille", "Khawla"]
    },
    {
        "topic": "Secure Element and Key Provisioning",
        "description": "This topic involves the secure element in systems and the process of key provisioning.",
        "experts": ["Mireille"]
    },
    {
        "topic": "Residential Gateway Security",
        "description": "This topic covers the security aspects of Residential Gateways.",
        "experts": ["Mireille"]
    },
    {
        "topic": "Standalone Non-Public Network (SNPN) Inter-Connection and Cybersecurity",
        "description": "This topic focuses on the inter-connection of Standalone Non-Public Networks and related cyber-security topics.",
        "experts": ["Khawla"]
    },
    {
        "topic": "Distributed Ledger and Blockchain in SNPN",
        "description": "This topic covers the use of distributed ledger technology and blockchain in securing Standalone Non-Public Networks.",
        "experts": ["Khawla"]
    },
    {
        "topic": "Distributed Networks and Communication",
        "description": "This topic involves distributed networks such as mesh networks, ad-hoc networks, and multi-hop networks, and their cyber-security aspects.",
        "experts": ["Guillaume"]
    },
    {
        "topic": "Swarm of Drones and Unmanned Aerial Vehicles Network Infrastructure",
        "description": "This topic covers the network infrastructure deployed by Swarm of Drones and Unmanned Aerial Vehicles.",
        "experts": ["Guillaume"]
    },
    {
        "topic": "USIM and Over-the-Air Services",
        "description": "This topic involves USIM and related over-the-air services such as Steering of Roaming, roaming services, network selection, and UE configuration.",
        "experts": ["Vincent"]
    },
    {
        "topic": "Eco-Design and Societal Impact of Technology",
        "description": "This topic covers eco-design concepts, including energy saving, energy efficiency, carbon emissions, and the societal impact of technology.",
        "experts": ["Pierre"]
    },
    {
        "topic": "Service Requirements of New Services",
        "description": "This topic involves defining service requirements for new services, detecting low signals of new trends and technologies, and assessing their impact on USIM services or over-the-air services.",
        "experts": ["Ly-Thanh"]
    },
    {
        "topic": "Satellite and Non Terrestrial Networks",
        "description": "This topic covers satellite networks, Non Terrestrial Networks, Private Networks, IoT, Inter Satellite communication, and Radio Access Network.",
        "experts": ["Nicolas"]
    },
    {
        "topic": "Public Safety and Emergency Communication",
        "description": "This topic involves Public Safety Communication, Military Communication, Emergency Calls, Emergency Services, Disaster Communication Access, and other related areas.",
        "experts": ["Dorin"]
    },
    {
        "topic": "Identifying the Human User of a Subscription",
        "description": "This topic involves methods and processes for identifying the human user associated with a subscription.",
        # NOTE(review): the original comment said the experts for this
        # category are not specified -- confirm the "Kumar" assignment.
        "experts": ["Kumar"]
    },
    {
        "topic": "Authentication and Authorization of Users and Restrictions on Users",
        "description": "This topic covers authentication and authorization processes, as well as restrictions imposed on users.",
        # NOTE(review): the original comment said the experts for this
        # category are not specified -- confirm the "Kumar" assignment.
        "experts": ["Kumar"]
    },
    {
        "topic": "Exposure of User Identity Profile Information",
        "description": "This topic involves the exposure of user identity profile information and its security implications.",
        # NOTE(review): the original comment said the experts for this
        # category are not specified -- confirm the "Kumar" assignment.
        "experts": ["Kumar"]
    },
    {
        "topic": "Identifying non-3GPP Devices Connecting behind a UE or 5G-RG",
        "description": "This topic involves identifying non-3GPP devices connecting behind a UE (User Equipment) or 5G-RG (5G Residential Gateway).",
        # NOTE(review): the original comment said the experts for this
        # category are not specified -- confirm the "Kumar" assignment.
        "experts": ["Kumar"]
    }
]


def add_categories(df, df_all):
    """Append the categories of ``df`` whose topic is not yet in ``df_all``.

    Args:
        df: DataFrame of candidate categories with ``topic``,
            ``description`` and ``experts`` columns.
        df_all: DataFrame of already-known categories (same columns).

    Returns:
        A tuple ``(update, merged)`` where ``update`` is a ``gr.update``
        carrying the merged topic names as ``choices`` and ``merged`` is the
        merged categories DataFrame.
    """
    new_records = df.to_dict("records")
    categories_all = df_all.to_dict("list")
    for cat in new_records:
        # The topic list grows as we append, so duplicates inside ``df``
        # itself are skipped as well.
        if cat['topic'] not in categories_all['topic']:
            categories_all['topic'].append(cat['topic'])
            categories_all['description'].append(cat['description'])
            categories_all['experts'].append(cat['experts'])
    print(f"AFTER ADDINGS Those are the categories_all : {categories_all}")
    return gr.update(choices=categories_all['topic']), pd.DataFrame.from_dict(categories_all)


df_cate = pd.DataFrame(categories)
df_cat_filter = df_cate.to_dict("list")["topic"]


def filter_by_topics(filters, categories):
    """Return a ``gr.DataFrame`` holding only the categories listed in ``filters``.

    Args:
        filters: Container of topic names to keep.
        categories: DataFrame of categories with a ``topic`` column.
    """
    selected = [
        cat for cat in categories.to_dict("records") if cat['topic'] in filters
    ]
    return gr.DataFrame(label='categories', value=pd.DataFrame(selected), interactive=True)


### End
def reset_cate(df_categories):
    """Toggle the categories table between the defaults and a blank row.

    When ``df_categories`` equals the default table a single empty row is
    returned; otherwise a copy of the default table is returned.
    """
    if df_categories.equals(df_cate):
        # The column must be named 'experts' (not 'expert'): add_categories
        # and match_categories both read that key from the table.
        return pd.DataFrame([['', '', '']], columns=['topic', 'description', 'experts'])
    return df_cate.copy()


def load_data(file_obj):
    """Read an Excel workbook into a DataFrame.

    ``file_obj`` is handed straight to ``pd.read_excel`` (presumably the
    upload provided by Gradio -- confirm against the caller).
    """
    return pd.read_excel(file_obj)


def initialize_models():
    """Load the ``all-mpnet-base-v2`` SentenceTransformer on the CUDA device."""
    model_ST = SentenceTransformer("all-mpnet-base-v2", device="cuda")
    return model_ST


@spaces.GPU
def generate_embeddings(df, model, Column):
    """Add an ``Embeddings`` column to ``df`` computed from ``Column``.

    For each row whose ``Column`` value is a string, the text is encoded
    with ``model``; when the frame has a ``Title`` column and the row's
    title is a string, the title is prepended on its own line. Rows whose
    ``Column`` value is not a string get ``np.nan``.

    Args:
        df: DataFrame to process; it is modified in place and returned.
        model: Object exposing ``encode(text, convert_to_tensor=True)``.
        Column: Name of the text column to embed.

    Returns:
        The same DataFrame with the ``Embeddings`` column set.
    """
    has_title = 'Title' in df.columns
    embeddings_list = []
    for index, row in df.iterrows():
        text = row[Column]
        if not isinstance(text, str):
            embeddings_list.append(np.nan)
            continue
        print(index)
        if has_title and isinstance(row["Title"], str):
            content = row["Title"] + "\n" + text
        else:
            content = text
        embeddings_list.append(model.encode(content, convert_to_tensor=True))
    df['Embeddings'] = embeddings_list
    return df


def process_categories(categories, model):
    """Build a categories DataFrame with an ``Embeddings`` column.

    Args:
        categories: Anything ``pd.DataFrame`` accepts that yields a
            ``description`` column.
        model: Object exposing ``encode(text, convert_to_tensor=True)``.

    Returns:
        A new DataFrame with each description's embedding in ``Embeddings``.
    """
    # Local name chosen so the module-level ``df_cate`` is not shadowed.
    category_df = pd.DataFrame(categories)
    category_df['Embeddings'] = category_df.apply(
        lambda cat: model.encode(cat['description'], convert_to_tensor=True),
        axis=1,
    )
    return category_df


def match_categories(df, category_df, treshold=0.45):
    """Score every embedded row of ``df`` against every category.

    One column per topic is added to ``df`` holding the cosine score when it
    exceeds ``treshold`` (``0.0`` otherwise), plus four summary columns:
    ``Description``, ``Topic`` and ``Score`` (values joined with ``"@~@"``)
    and ``Expert`` (a list with one expert list per matched category). Rows
    whose ``Embeddings`` value is not a tensor get ``np.nan`` in the summary
    columns.

    Args:
        df: DataFrame with an ``Embeddings`` column; modified in place.
        category_df: DataFrame with ``topic``, ``description``, ``experts``
            and ``Embeddings`` columns.
        treshold: Minimum (exclusive) cosine similarity for a match.

    Returns:
        The same ``df`` with the extra columns.
    """
    # Positional lists: matches are positions in the stacked matrix, so they
    # must not be used as index labels of ``category_df``.
    topics = category_df['topic'].tolist()
    descriptions = category_df['description'].tolist()
    experts = category_df['experts'].tolist()

    # Float initialisation avoids assigning floats into an int column later.
    for topic in topics:
        df[topic] = 0.0

    # Loop-invariant: stack the category embeddings once, not per row.
    category_matrix = (
        torch.stack(list(category_df['Embeddings']), dim=0) if topics else None
    )

    categories_list, experts_list, topic_list, scores_list = [], [], [], []
    for position, ebd_content in enumerate(df['Embeddings']):
        if not isinstance(ebd_content, torch.Tensor):
            categories_list.append(np.nan)
            experts_list.append(np.nan)
            topic_list.append(np.nan)
            scores_list.append(np.nan)
            continue

        if category_matrix is None:
            # No categories to compare against: nothing can match.
            scores, matches = [], []
        else:
            cos_scores = util.cos_sim(ebd_content, category_matrix)[0]
            matches = torch.nonzero(cos_scores > treshold).flatten().tolist()
            scores = cos_scores.tolist()

        categories_list.append("@~@".join(descriptions[j] for j in matches))
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        experts_list.append([list(dict.fromkeys(experts[j])) for j in matches])
        topic_list.append("@~@".join(topics[j] for j in matches))
        scores_list.append("@~@".join(str(scores[j]) for j in matches))

        row_label = df.index[position]
        for j in matches:
            df.loc[row_label, topics[j]] = scores[j]

    df["Description"] = categories_list
    df["Expert"] = experts_list
    df["Topic"] = topic_list
    df["Score"] = scores_list
    return df


def save_data(df, filename):
    """Write ``df`` (without ``Embeddings``) next to ``filename``.

    The output name is ``filename`` with ``_classified`` inserted before the
    final extension, e.g. ``data.xlsx`` -> ``data_classified.xlsx``. Only the
    last extension is touched, so dots elsewhere in the path are preserved.

    Returns:
        The path of the written Excel file.
    """
    df = df.drop(columns=['Embeddings'])
    root, ext = os.path.splitext(filename)
    new_filename = f"{root}_classified{ext}"
    df.to_excel(new_filename, index=False)
    return new_filename


def classification(column, file_path, categories, treshold):
    """Run the full pipeline: load, embed, match, and save.

    Args:
        column: Name of the text column to classify.
        file_path: Path of the Excel file to read; also used to derive the
            output file name.
        categories: Categories table passed to ``process_categories``.
        treshold: Cosine-similarity threshold passed to ``match_categories``.

    Returns:
        A tuple ``(output_path, df)`` with the saved file path and the
        classified DataFrame (still containing ``Embeddings``).
    """
    # Load data
    df = load_data(file_path)
    # Initialize models
    model_ST = initialize_models()
    print('Generating Embeddings')
    # Generate embeddings for df
    df = generate_embeddings(df, model_ST, column)
    print('Embeddings Generated')
    category_df = process_categories(categories, model_ST)
    # Match categories
    df = match_categories(df, category_df, treshold=treshold)
    # Save data
    return save_data(df, file_path), df


def download_cate(cate_df):
    """Write ``cate_df`` to ``categories.xlsx`` and return it as a visible ``gr.File``."""
    cate_df.to_excel('categories.xlsx')
    return gr.File(value='categories.xlsx', visible=True)