Spaces:
Sleeping
Sleeping
Update classification.py
Browse files- classification.py +51 -56
classification.py
CHANGED
@@ -8,12 +8,18 @@ def load_data(file_obj):
|
|
8 |
# Assuming file_obj is a file-like object uploaded via Gradio, use `pd.read_excel` directly on it
|
9 |
return pd.read_excel(file_obj)
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
def generate_embeddings(df, model, Column):
|
12 |
embeddings_list = []
|
13 |
for index, row in df.iterrows():
|
14 |
-
if type(row["
|
15 |
print(index)
|
16 |
-
content = row["
|
17 |
embeddings = model.encode(content, convert_to_tensor=True)
|
18 |
embeddings_list.append(embeddings)
|
19 |
else:
|
@@ -23,89 +29,78 @@ def generate_embeddings(df, model, Column):
|
|
23 |
|
24 |
|
25 |
def process_categories(categories, model):
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
return
|
|
|
33 |
|
34 |
|
35 |
def match_categories(df, category_df):
|
36 |
-
|
|
|
37 |
for ebd_content in df['Embeddings']:
|
38 |
-
if
|
39 |
cos_scores = util.cos_sim(ebd_content, torch.stack(list(category_df['Embeddings']), dim=0))[0]
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
|
|
|
|
|
|
44 |
else:
|
45 |
categories_list.append(np.nan)
|
46 |
experts_list.append(np.nan)
|
47 |
-
|
48 |
-
|
|
|
|
|
49 |
df["Expert"] = experts_list
|
|
|
50 |
df["Score"] = scores_list
|
51 |
return df
|
52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
def save_data(df, filename):
    """Write ``df`` to an Excel file named after ``filename`` with a ``_classified`` suffix.

    The suffix is inserted immediately before the file extension only, so
    dots elsewhere in the path (directories, ``./`` prefixes, multi-dot
    names) are left untouched.

    Args:
        df: Object exposing ``to_excel`` (a pandas DataFrame in this file).
        filename: Path of the input file, used to derive the output path.

    Returns:
        The path the data was written to.
    """
    # Local import: the file's top-level import block is not visible from here.
    import os

    # str.replace(".", ...) would rewrite every dot in the path; splitext
    # isolates just the final extension.
    root, ext = os.path.splitext(filename)
    new_filename = root + "_classified" + ext
    df.to_excel(new_filename, index=False)
    return new_filename
|
58 |
|
59 |
-
|
60 |
-
def classification(column, file_path):
|
61 |
# Load data
|
62 |
df = load_data(file_path)
|
63 |
|
64 |
# Initialize models
|
65 |
-
model_ST =
|
66 |
|
67 |
# Generate embeddings for df
|
68 |
df = generate_embeddings(df, model_ST, column)
|
69 |
|
70 |
-
|
71 |
-
categories = [
|
72 |
-
{
|
73 |
-
"expert": "mireille",
|
74 |
-
"bio":"expert in security, interested in protection of confidentiality, privacy, integrity, authentication and authorization. Also interested in distributed trust, end-user trust models, secure element, key provisioning, Residential Gateway"
|
75 |
-
},
|
76 |
-
{
|
77 |
-
"expert":"khawla",
|
78 |
-
"bio":"expert in inter-connection of Standalone Non-Public Network (SNPN) and cyber-security related topics of such types of networks including distributed trust, distributed ledge, blockchain, authentication, private networks security, provisioning of credentials"
|
79 |
-
},
|
80 |
-
{
|
81 |
-
"expert":"guillaume",
|
82 |
-
"bio":"expert in distributed networks and communication, such as mesh network, ad-hoc networks multi-hop network, and the cyber-security of such topologies. Swarm of Drones and Unmanned Aerial Vehicles may deployed such network infrastructure. It is essential to look at how devices/UE authenticate to these networks, and assess the threats and provide counter measures"
|
83 |
-
},
|
84 |
-
{
|
85 |
-
"expert":"vincent",
|
86 |
-
"bio":"expert in USIM and related over-the-air services to manage the USIM e.g. Steering of Roaming (SoR), roaming services, network selection, UE configuration or configuration in the Secure Element or USIM"
|
87 |
-
},
|
88 |
-
{
|
89 |
-
"expert":"pierre",
|
90 |
-
"bio":"expert in eco-design, intereted in societal impact of technology, wants to push Key Value concepts to 3GPP and in particular defines Key Value Indicators (KVI) in the service requirements. Energy saving and energy efficiency are key aspects in eco-design, as well as carbon emissions and global use of the telecommunication technologies"
|
91 |
-
},
|
92 |
-
{
|
93 |
-
"expert":"ly-thanh",
|
94 |
-
"bio":"expert in service requirements of new services defines in new Study Items (SID) ad Work Items (WID). Has to detect low signals of new trends and technologies e.g. Artificial Intelligence (AI/ML), Metaverse new trust concepts, new network topologies, and new topics that may have an impact on the USIM services or over-the-air services. Thes impacts may by new threats or opportunities for the USIM/Secure Element/Card/Roaming services business."
|
95 |
-
},
|
96 |
-
{
|
97 |
-
"expert":"nicolas",
|
98 |
-
"bio":"expert in satellite, and Non Terrestrial Network NTN, is interested in Private Networks SNPN, IoT, Inter Satellite communication, Geo Stationnary Satellite GEO, Low Orbite Satellite LEO, Medium Orbite Satellite MEO, Radio Access Network RAN"
|
99 |
-
},
|
100 |
-
{
|
101 |
-
"expert":"dorin",
|
102 |
-
"bio":"Public Safety Communication, Military Communication, Emeregency Calls, Emergency Services, Disaster Communication Access, PLMN Access During Disasters, Emergency Communication Enhancements, Ultra reliable low latency communication URLLC, Tactical Bubble, Private Network, Proximity Services PROSE, Radio Access Network RAN, Mission Critical Services MCS"
|
103 |
-
}
|
104 |
-
]
|
105 |
category_df = process_categories(categories, model_ST)
|
106 |
|
107 |
# Match categories
|
108 |
df = match_categories(df, category_df)
|
109 |
|
110 |
# Save data
|
111 |
-
return save_data(df,file_path), df
|
|
|
8 |
# Assuming file_obj is a file-like object uploaded via Gradio, use `pd.read_excel` directly on it
|
9 |
return pd.read_excel(file_obj)
|
10 |
|
11 |
+
|
12 |
+
def initialize_models():
    """Build and return the sentence-embedding model used by the pipeline.

    Returns:
        A ``SentenceTransformer`` created from the "all-mpnet-base-v2" model name.
    """
    return SentenceTransformer("all-mpnet-base-v2")
|
15 |
+
|
16 |
+
|
17 |
def generate_embeddings(df, model, Column):
|
18 |
embeddings_list = []
|
19 |
for index, row in df.iterrows():
|
20 |
+
if type(row["Title"]) == str and type(row[Column]) == str:
|
21 |
print(index)
|
22 |
+
content = row["Title"] + "\n" + row[Column]
|
23 |
embeddings = model.encode(content, convert_to_tensor=True)
|
24 |
embeddings_list.append(embeddings)
|
25 |
else:
|
|
|
29 |
|
30 |
|
31 |
def process_categories(categories, model):
    """Build a category table and attach one embedding per category description.

    Args:
        categories: Anything ``pd.DataFrame`` accepts (e.g. a list of dicts);
            each record must provide a ``description`` field.
        model: Object exposing ``encode(text, convert_to_tensor=True)``.

    Returns:
        A new DataFrame holding the category fields plus an ``Embeddings``
        column containing whatever ``model.encode`` returned for each
        description. An empty ``categories`` yields an empty frame that still
        has the ``Embeddings`` column.

    Raises:
        KeyError: If the categories are non-empty but lack ``description``.
    """
    df_cate = pd.DataFrame(categories)

    # Nothing to encode: still expose the column callers read afterwards.
    if len(df_cate) == 0:
        df_cate['Embeddings'] = []
        return df_cate

    if 'description' not in df_cate.columns:
        raise KeyError("categories must provide a 'description' field")

    # Encode column-wise instead of DataFrame.apply(axis=1): no per-row Series
    # is built and pandas never tries to infer a result shape from the tensors.
    df_cate['Embeddings'] = [
        model.encode(text, convert_to_tensor=True)
        for text in df_cate['description']
    ]
    return df_cate
|
39 |
+
|
40 |
|
41 |
|
42 |
def match_categories(df, category_df, threshold=0.45):
    """Attach to each row of ``df`` the categories whose embedding is similar enough.

    For every entry of ``df['Embeddings']`` that is a ``torch.Tensor``, the
    cosine similarity against every category embedding is computed and all
    categories scoring strictly above ``threshold`` are kept. Rows whose
    embedding is not a tensor receive NaN for Description/Expert/Topic and the
    marker string 'pas interessant' for Score.

    Args:
        df: Frame with an ``Embeddings`` column; modified in place.
        category_df: Frame with ``description``, ``experts``, ``topic`` and
            ``Embeddings`` columns.
        threshold: Minimum (exclusive) cosine score for a category to match.

    Returns:
        ``df`` with the ``Description``, ``Expert``, ``Topic`` and ``Score``
        columns set (lists for embedded rows, placeholders otherwise).
    """
    categories_list, experts_list, topic_list, scores_list = [], [], [], []

    # With no categories there is nothing to stack (torch.stack rejects an
    # empty list); embedded rows then simply get empty match lists.
    has_categories = len(category_df) > 0
    category_matrix = None  # built once, on the first embedded row

    for ebd_content in df['Embeddings']:
        if not isinstance(ebd_content, torch.Tensor):
            categories_list.append(np.nan)
            experts_list.append(np.nan)
            topic_list.append(np.nan)
            scores_list.append('pas interessant')
            continue

        high_score_indices, cos_scores = [], None
        if has_categories:
            if category_matrix is None:
                # Loop-invariant: stack the category embeddings a single time.
                category_matrix = torch.stack(list(category_df['Embeddings']), dim=0)
            cos_scores = util.cos_sim(ebd_content, category_matrix)[0]
            high_score_indices = torch.nonzero(cos_scores > threshold).flatten().tolist()

        # The indices are positions in the stacked matrix, so look rows up by
        # position (.iloc); label lookup breaks on a non-default index.
        categories_list.append([category_df['description'].iloc[i] for i in high_score_indices])
        experts_list.append([category_df['experts'].iloc[i] for i in high_score_indices])
        topic_list.append([category_df['topic'].iloc[i] for i in high_score_indices])
        scores_list.append([float(cos_scores[i]) for i in high_score_indices])

    df["Description"] = categories_list
    df["Expert"] = experts_list
    df["Topic"] = topic_list
    df["Score"] = scores_list
    return df
|
66 |
|
67 |
+
def flatten_nested_lists(nested_list):
    """Flatten a list of potentially nested lists into a single list.

    Only ``list`` instances are expanded; every other item (strings, tuples,
    numbers, ...) is kept as a single element. Items keep their left-to-right
    order.

    Args:
        nested_list: Iterable whose items may themselves be lists, to any depth.

    Returns:
        A new flat list of the non-list items.
    """
    flattened_list = []
    # Explicit stack of iterators instead of recursion, so very deep nesting
    # cannot hit the interpreter's recursion limit.
    pending = [iter(nested_list)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend; the partially consumed parent iterator resumes later.
                pending.append(iter(item))
                break
            flattened_list.append(item)
        else:
            # Current level exhausted: go back up to the parent.
            pending.pop()
    return flattened_list
|
76 |
|
77 |
def save_data(df, filename):
    """Collapse list-valued result columns to text and write ``df`` to Excel.

    The ``Expert``, ``Description``, ``Topic`` and ``Score`` columns are
    rewritten in place: list cells become a ", "-joined string (``Expert`` is
    flattened first), any other cell is left as is. The output file name is
    ``filename`` with ``_classified`` inserted just before its extension.

    Args:
        df: DataFrame holding the four result columns; modified in place.
        filename: Path of the input file, used to derive the output path.

    Returns:
        The path the data was written to.
    """
    # Local import: the file's top-level import block is not visible from here.
    import os

    def _joined(values):
        # str() each entry so non-string items (e.g. float scores) cannot
        # make str.join raise.
        return ', '.join(map(str, values))

    df['Expert'] = df['Expert'].apply(
        lambda x: _joined(flatten_nested_lists(x)) if isinstance(x, list) else x
    )
    for column in ('Description', 'Topic', 'Score'):
        df[column] = df[column].apply(
            lambda x: _joined(x) if isinstance(x, list) else x
        )

    # str.replace(".", ...) would rewrite every dot in the path (directories,
    # "./" prefixes, multi-dot names); splitext isolates the final extension.
    root, ext = os.path.splitext(filename)
    new_filename = root + "_classified" + ext
    df.to_excel(new_filename, index=False)
    return new_filename
|
88 |
|
89 |
+
def classification(column, file_path, categories):
    """Run the whole pipeline: load, embed, match against categories, save.

    Args:
        column: Forwarded to ``generate_embeddings`` as the column to embed.
        file_path: Forwarded to ``load_data`` and reused by ``save_data`` to
            derive the output file name.
        categories: Forwarded to ``process_categories``.

    Returns:
        A tuple ``(output_path, frame)``: the value returned by ``save_data``
        and the frame that was passed to it.
    """
    # Load the input before the model, in the same order as before.
    documents = load_data(file_path)
    encoder = initialize_models()

    # Embed the documents, then the categories, with the same model.
    documents = generate_embeddings(documents, encoder, column)
    category_frame = process_categories(categories, encoder)

    # Attach the matching categories to every document.
    documents = match_categories(documents, category_frame)

    # Persist the result and hand back both the path and the frame.
    output_path = save_data(documents, file_path)
    return output_path, documents
|