danieldux committed on
Commit
8d30fa7
1 Parent(s): 2030295

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -1
app.py CHANGED
@@ -1,3 +1,59 @@
 
 
 
 
 
 
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- gr.load("models/BAAI/bge-m3").launch()
 
 
1
+ # import gradio as gr
2
+
3
+ # gr.load("models/BAAI/bge-m3").launch()
4
+
5
+ import json
6
+ import faiss
7
+ import numpy as np
8
  import gradio as gr
9
+ from FlagEmbedding import BGEM3FlagModel
10
+
11
+ # Define a function to load the ISCO taxonomy
12
def load_isco_taxonomy(file_path: str) -> list:
    """Load the ISCO taxonomy from a JSON Lines file.

    Each non-blank line of the file is parsed as one JSON document.

    Args:
        file_path: Path to a UTF-8 encoded JSON Lines file.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank or whitespace-only lines (e.g. a trailing empty line) are not
        # records; passing them to json.loads would raise, so skip them.
        # json.loads tolerates surrounding whitespace, so no strip is needed.
        return [json.loads(line) for line in file if line.strip()]
16
+
17
+ # Define a function to create a FAISS index
18
def create_faiss_index(isco_taxonomy, model_name='BAAI/bge-m3',
                       index_path='isco_taxonomy.index',
                       mapping_path='isco_taxonomy_mapping.json'):
    """Embed the taxonomy descriptions and persist a FAISS index plus a lookup file.

    The ``ESCO_DESCRIPTION`` field of every entry is encoded with a
    ``BGEM3FlagModel``; the resulting ``dense_vecs`` are stored in a
    ``faiss.IndexFlatL2`` written to *index_path*. A JSON mapping from row
    position to the original entry is written to *mapping_path*.

    Args:
        isco_taxonomy: Non-empty sequence of dict-like entries, each with an
            ``ESCO_DESCRIPTION`` key.
        model_name: Model identifier passed to ``BGEM3FlagModel``.
        index_path: Destination file for the FAISS index.
        mapping_path: Destination file for the JSON position-to-entry mapping.

    Raises:
        ValueError: If *isco_taxonomy* is empty.
        KeyError: If an entry has no ``ESCO_DESCRIPTION`` key.
    """
    if not isco_taxonomy:
        raise ValueError("isco_taxonomy is empty; nothing to index")

    # Extract the texts before loading the model so malformed entries fail
    # fast instead of after an expensive model load.
    texts = [str(entry['ESCO_DESCRIPTION']) for entry in isco_taxonomy]

    model = BGEM3FlagModel(model_name, use_fp16=True)
    embeddings = model.encode(texts, batch_size=12, max_length=256)['dense_vecs']
    embeddings = np.array(embeddings).astype('float32')

    # The index dimensionality is the width of the embedding matrix.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, index_path)

    with open(mapping_path, 'w') as f:
        # json.dump turns the integer keys into strings ("0", "1", ...), so
        # readers must look entries up by str(position).
        json.dump({i: entry for i, entry in enumerate(isco_taxonomy)}, f)
29
+
30
+ # Define a function to retrieve and rerank using FAISS
31
# Loaded embedding models keyed by model name, so repeated queries do not
# rebuild the model on every call.
_EMBEDDING_MODELS = {}


def _get_embedding_model(model_name):
    """Return the BGEM3FlagModel for *model_name*, loading it on first use."""
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        model = BGEM3FlagModel(model_name, use_fp16=True)
        _EMBEDDING_MODELS[model_name] = model
    return model


def retrieve_and_rerank_faiss(job_duties, model_name="BAAI/bge-m3", top_k=4):
    """Return the taxonomy descriptions closest to *job_duties*.

    If the FAISS index or its JSON mapping is missing, both are built from
    ``isco_taxonomy.jsonl`` first. The query text is then embedded and
    searched against the index. Despite the name, no separate reranking step
    is performed; results are returned in the order FAISS yields them.

    Args:
        job_duties: Free-text description to search for.
        model_name: Model identifier passed to ``BGEM3FlagModel``; also used
            when the index has to be built.
        top_k: Maximum number of matches to return.

    Returns:
        A list of ``(ESCO_DESCRIPTION, distance)`` tuples, at most *top_k*
        long (shorter when the index holds fewer than *top_k* vectors).
    """
    # Imported locally because the module's import block does not provide os.
    import os

    index_path = "isco_taxonomy.index"
    mapping_path = "isco_taxonomy_mapping.json"

    # Both artefacts are written together, so rebuild if either is missing.
    if not (os.path.exists(index_path) and os.path.exists(mapping_path)):
        isco_taxonomy = load_isco_taxonomy('isco_taxonomy.jsonl')
        # Build with the same model that will embed the query.
        create_faiss_index(isco_taxonomy, model_name)

    index = faiss.read_index(index_path)
    with open(mapping_path, "r") as f:
        isco_taxonomy = json.load(f)

    model = _get_embedding_model(model_name)
    query_embedding = model.encode([job_duties], max_length=256)["dense_vecs"]
    query_embedding = np.array(query_embedding).astype("float32")
    distances, indices = index.search(query_embedding, top_k)

    # Only one query was submitted, so row 0 holds its results. FAISS pads
    # with label -1 when fewer than top_k neighbours exist; skip those.
    # The mapping was round-tripped through JSON, hence the string keys.
    return [
        (isco_taxonomy[str(idx)]["ESCO_DESCRIPTION"], dist)
        for idx, dist in zip(indices[0], distances[0])
        if idx >= 0
    ]
48
+
49
+ # Optional: pre-build the index ahead of time (e.g. from a setup script). retrieve_and_rerank_faiss also builds it on first use when the index file is missing.
50
+ # isco_taxonomy = load_isco_taxonomy('isco_taxonomy.jsonl')
51
+ # create_faiss_index(isco_taxonomy)
52
+
53
+ # Gradio Interface
54
def gradio_interface(job_duties):
    """Gradio callback: format the nearest taxonomy matches for *job_duties*.

    Calls ``retrieve_and_rerank_faiss`` with its default settings and returns
    a list of display strings, one per ``(description, distance)`` result.
    """
    formatted = []
    for description, distance in retrieve_and_rerank_faiss(job_duties):
        formatted.append(f"Description: {description}, Distance: {distance}")
    return formatted
57
 
58
# Wire the callback into a text-in / text-out Gradio interface and start it.
iface = gr.Interface(
    fn=gradio_interface,
    inputs="text",
    outputs="text",
    title="Job Duties to ISCO Descriptions",
)
iface.launch()