Standard_Intelligence_Dev

Running

App Files Files Community

Standard_Intelligence_Dev / classification.py

heymenn

Update classification.py

3049ae8 verified 6 months ago

raw

history blame

9.73 kB

	import pandas as pd
	from transformers import AutoTokenizer, AutoModel
	from sentence_transformers import SentenceTransformer, util
	import numpy as np
	import torch
	import gradio as gr


	# Default expert categories: each entry gives a topic name, a description
	# of the topic, and the list of experts attached to it.
	categories = [
	    {
	        "topic": "Confidentiality and Privacy Protection",
	        "description": "This topic covers the protection of confidentiality, privacy, and integrity in security systems. It also includes authentication and authorization processes.",
	        "experts": ["Mireille"],
	    },
	    {
	        "topic": "Distributed Trust and End-User Trust Models",
	        "description": "This topic focuses on distributed trust models and how end-users establish trust in secure systems.",
	        "experts": ["Mireille", "Khawla"],
	    },
	    {
	        "topic": "Secure Element and Key Provisioning",
	        "description": "This topic involves the secure element in systems and the process of key provisioning.",
	        "experts": ["Mireille"],
	    },
	    {
	        "topic": "Residential Gateway Security",
	        "description": "This topic covers the security aspects of Residential Gateways.",
	        "experts": ["Mireille"],
	    },
	    {
	        "topic": "Standalone Non-Public Network (SNPN) Inter-Connection and Cybersecurity",
	        "description": "This topic focuses on the inter-connection of Standalone Non-Public Networks and related cyber-security topics.",
	        "experts": ["Khawla"],
	    },
	    {
	        "topic": "Distributed Ledger and Blockchain in SNPN",
	        "description": "This topic covers the use of distributed ledger technology and blockchain in securing Standalone Non-Public Networks.",
	        "experts": ["Khawla"],
	    },
	    {
	        "topic": "Distributed Networks and Communication",
	        "description": "This topic involves distributed networks such as mesh networks, ad-hoc networks, and multi-hop networks, and their cyber-security aspects.",
	        "experts": ["Guillaume"],
	    },
	    {
	        "topic": "Swarm of Drones and Unmanned Aerial Vehicles Network Infrastructure",
	        "description": "This topic covers the network infrastructure deployed by Swarm of Drones and Unmanned Aerial Vehicles.",
	        "experts": ["Guillaume"],
	    },
	    {
	        "topic": "USIM and Over-the-Air Services",
	        "description": "This topic involves USIM and related over-the-air services such as Steering of Roaming, roaming services, network selection, and UE configuration.",
	        "experts": ["Vincent"],
	    },
	    {
	        "topic": "Eco-Design and Societal Impact of Technology",
	        "description": "This topic covers eco-design concepts, including energy saving, energy efficiency, carbon emissions, and the societal impact of technology.",
	        "experts": ["Pierre"],
	    },
	    {
	        "topic": "Service Requirements of New Services",
	        "description": "This topic involves defining service requirements for new services, detecting low signals of new trends and technologies, and assessing their impact on USIM services or over-the-air services.",
	        "experts": ["Ly-Thanh"],
	    },
	    {
	        "topic": "Satellite and Non Terrestrial Networks",
	        "description": "This topic covers satellite networks, Non Terrestrial Networks, Private Networks, IoT, Inter Satellite communication, and Radio Access Network.",
	        "experts": ["Nicolas"],
	    },
	    {
	        "topic": "Public Safety and Emergency Communication",
	        "description": "This topic involves Public Safety Communication, Military Communication, Emergency Calls, Emergency Services, Disaster Communication Access, and other related areas.",
	        "experts": ["Dorin"],
	    },
	    # NOTE(review): the four entries below were previously annotated as having
	    # no experts specified, yet each one lists "Kumar" -- confirm the assignment.
	    {
	        "topic": "Identifying the Human User of a Subscription",
	        "description": "This topic involves methods and processes for identifying the human user associated with a subscription.",
	        "experts": ["Kumar"],
	    },
	    {
	        "topic": "Authentication and Authorization of Users and Restrictions on Users",
	        "description": "This topic covers authentication and authorization processes, as well as restrictions imposed on users.",
	        "experts": ["Kumar"],
	    },
	    {
	        "topic": "Exposure of User Identity Profile Information",
	        "description": "This topic involves the exposure of user identity profile information and its security implications.",
	        "experts": ["Kumar"],
	    },
	    {
	        "topic": "Identifying non-3GPP Devices Connecting behind a UE or 5G-RG",
	        "description": "This topic involves identifying non-3GPP devices connecting behind a UE (User Equipment) or 5G-RG (5G Residential Gateway).",
	        "experts": ["Kumar"],
	    },
	]

	def add_categories(df,df_all):
	    """Append to `df_all` every row of `df` whose topic it does not hold yet.

	    Both arguments are DataFrames with 'topic', 'description' and 'experts'
	    columns. Returns a tuple of (a `gr.update` carrying the merged topic
	    list as `choices`, a DataFrame rebuilt from the merged columns).
	    """
	    merged = df_all.to_dict("list")
	    known_topics = merged['topic']
	    for row in df.to_dict("records"):
	        # A topic already present (including one added earlier in this loop)
	        # is skipped rather than duplicated.
	        if row['topic'] in known_topics:
	            continue
	        for field in ('topic', 'description', 'experts'):
	            merged[field].append(row[field])
	    print(f"AFTER ADDINGS Those are the categories_all : {merged}")

	    return gr.update(choices=merged['topic']), pd.DataFrame.from_dict(merged)

	# Default category table built from `categories`, plus the plain list of
	# its topic names.
	df_cate = pd.DataFrame(categories)
	df_cat_filter = df_cate["topic"].tolist()

	def filter_by_topics(filters, categories):
	    """Build a Gradio DataFrame holding only the categories named in `filters`.

	    `categories` is a DataFrame with a 'topic' column; `filters` is any
	    container supporting `in`. Row order of `categories` is preserved.
	    """
	    selected = [
	        row for row in categories.to_dict("records") if row['topic'] in filters
	    ]

	    return gr.DataFrame(label='categories', value=pd.DataFrame(selected), interactive=True)

	### End

	def reset_cate(df_categories):
	    """Toggle the category table between the defaults and a single blank row.

	    If `df_categories` equals the module-level default table `df_cate`, a
	    one-row table of empty strings is returned; otherwise a copy of the
	    default table is returned.

	    The blank table uses the same column names as the default categories
	    ('topic', 'description', 'experts'). It previously used 'expert', which
	    the other functions in this module, reading 'experts', could not find.
	    """
	    if df_categories.equals(df_cate):
	        return pd.DataFrame([['', '', '']], columns=['topic', 'description', 'experts'])
	    # Copy so later edits to the returned table never touch the defaults.
	    return df_cate.copy()


	def load_data(file_obj):
	    """Read an Excel workbook into a DataFrame.

	    `file_obj` is handed straight to `pd.read_excel`; it is assumed to be
	    the file uploaded through Gradio.
	    """
	    frame = pd.read_excel(file_obj)
	    return frame


	def initialize_models():
	    """Instantiate and return the "all-mpnet-base-v2" SentenceTransformer."""
	    return SentenceTransformer("all-mpnet-base-v2")


	def generate_embeddings(df, model, Column):
	    """Add an 'Embeddings' column holding one encoding per row of `df`.

	    For each row whose `Column` value is a string, the text is encoded with
	    `model.encode(..., convert_to_tensor=True)`. When the frame has a
	    'Title' column and the row's title is a string, the encoded text is the
	    title, a newline, then the `Column` value. Rows whose `Column` value is
	    not a string get `np.nan`.

	    Args:
	        df: DataFrame to process; it is modified in place.
	        model: object exposing `encode(text, convert_to_tensor=True)`.
	        Column: name of the column holding the text to embed.

	    Returns:
	        The same `df` object, with the 'Embeddings' column set.
	    """
	    # The column set does not change while iterating, so check it once.
	    has_title = 'Title' in df.columns
	    embeddings_list = []
	    for index, row in df.iterrows():
	        text = row[Column]
	        # isinstance also accepts str subclasses, which an exact type
	        # comparison would wrongly treat as missing text.
	        if not isinstance(text, str):
	            embeddings_list.append(np.nan)
	            continue
	        print(index)
	        title = row["Title"] if has_title else None
	        content = f"{title}\n{text}" if isinstance(title, str) else text
	        embeddings_list.append(model.encode(content, convert_to_tensor=True))
	    df['Embeddings'] = embeddings_list
	    return df


	def process_categories(categories, model):
	    """Return a category DataFrame with an added 'Embeddings' column.

	    A new DataFrame is built from `categories`, and each row's
	    'description' is encoded with `model.encode(..., convert_to_tensor=True)`.
	    """
	    category_frame = pd.DataFrame(categories)

	    def _embed_description(row):
	        # One encoding per category, computed from its description text.
	        return model.encode(row['description'], convert_to_tensor=True)

	    category_frame['Embeddings'] = category_frame.apply(_embed_description, axis=1)

	    return category_frame



	def match_categories(df, category_df, treshold=0.45):
	    """Attach to each row of `df` the categories whose similarity exceeds `treshold`.

	    For every value in `df['Embeddings']` that is a `torch.Tensor`, the
	    cosine similarity against all of `category_df['Embeddings']` is computed
	    with `util.cos_sim`. Categories scoring strictly above `treshold` are
	    collected into lists of descriptions, experts, topics and float scores.
	    Rows without a tensor get `np.nan` for the first three and the string
	    'pas interessant' as the score.

	    Args:
	        df: DataFrame with an 'Embeddings' column; it is modified in place.
	        category_df: DataFrame with 'Embeddings', 'description', 'experts'
	            and 'topic' columns.
	        treshold: similarity cut-off (exclusive).

	    Returns:
	        The same `df` object with 'Description', 'Expert', 'Topic' and
	        'Score' columns set.
	    """
	    descriptions_list, experts_list, topic_list, scores_list = [], [], [], []
	    # Built on first use only: the stacked category matrix is the same for
	    # every row, and frames without any tensor row never need it.
	    category_matrix = None
	    for ebd_content in df['Embeddings']:
	        if not isinstance(ebd_content, torch.Tensor):
	            descriptions_list.append(np.nan)
	            experts_list.append(np.nan)
	            topic_list.append(np.nan)
	            scores_list.append('pas interessant')
	            continue

	        if category_matrix is None:
	            category_matrix = torch.stack(list(category_df['Embeddings']), dim=0)
	        cos_scores = util.cos_sim(ebd_content, category_matrix)[0]
	        high_score_indices = [i for i, score in enumerate(cos_scores) if score > treshold]

	        # The indices are positions within the stacked matrix, so rows are
	        # looked up positionally (.iloc); label-based lookup would pick the
	        # wrong rows when category_df does not have a default 0..n-1 index.
	        descriptions_list.append([category_df['description'].iloc[i] for i in high_score_indices])
	        experts_list.append([category_df['experts'].iloc[i] for i in high_score_indices])
	        topic_list.append([category_df['topic'].iloc[i] for i in high_score_indices])
	        scores_list.append([float(cos_scores[i]) for i in high_score_indices])

	    df["Description"] = descriptions_list
	    df["Expert"] = experts_list
	    df["Topic"] = topic_list
	    df["Score"] = scores_list
	    return df

	def flatten_nested_lists(nested_list):
	    """Return the non-list leaves of arbitrarily nested lists, in order.

	    Only `list` instances are descended into; any other item (tuples
	    included) is kept as a single element.
	    """
	    flat = []
	    # Stack of iterators: the top one is the list currently being walked.
	    pending = [iter(nested_list)]
	    while pending:
	        for element in pending[-1]:
	            if isinstance(element, list):
	                # Walk the inner list first, then resume the outer one.
	                pending.append(iter(element))
	                break
	            flat.append(element)
	        else:
	            pending.pop()
	    return flat

	def save_data(df, filename):
	    """Write the classified rows to an Excel file next to the input file.

	    List values in the 'Expert', 'Description', 'Topic' and 'Score' columns
	    are joined into comma-separated strings. This is done in place on `df`,
	    so the caller's frame holds the joined strings afterwards. The
	    'Embeddings' column is dropped from the written copy only.

	    Args:
	        df: DataFrame with the columns named above.
	        filename: path of the input file, as a string.

	    Returns:
	        The output path: `filename` with "_classified" inserted before its
	        extension.
	    """
	    import os

	    # Apply flattening and then join for the 'Expert' column
	    df['Expert'] = df['Expert'].apply(lambda x: ', '.join(flatten_nested_lists(x)) if isinstance(x, list) else x)
	    df['Description'] = df['Description'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
	    df['Topic'] = df['Topic'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
	    df['Score'] = df['Score'].apply(lambda x: ', '.join(map(str, x)) if isinstance(x, list) else x)

	    df = df.drop(columns=['Embeddings'])
	    # Touch only the final extension: replacing every "." would also rewrite
	    # dots in directory names or elsewhere in the file name.
	    root, extension = os.path.splitext(filename)
	    new_filename = f"{root}_classified{extension}"
	    df.to_excel(new_filename, index=False)
	    return new_filename

	def classification(column, file_path, categories, treshold):
	    """Run the full pipeline: load, embed, match against categories, save.

	    Args:
	        column: name of the column in the input file holding the text.
	        file_path: path of the Excel file to classify.
	        categories: category records passed to `process_categories`.
	        treshold: similarity cut-off passed to `match_categories`.

	    Returns:
	        A tuple of (path of the written file, the classified DataFrame).
	    """
	    # Read the input before loading the model.
	    documents = load_data(file_path)
	    encoder = initialize_models()

	    # Embed both sides with the same model so the scores are comparable.
	    documents = generate_embeddings(documents, encoder, column)
	    category_table = process_categories(categories, encoder)

	    matched = match_categories(documents, category_table, treshold=treshold)

	    # save_data runs first, so the frame returned alongside the path
	    # is the one it has already processed.
	    output_path = save_data(matched, file_path)
	    return output_path, matched