fkonovalenko committed on
Commit
e6dc8c2
1 Parent(s): ea8ee61

first commit

Browse files
Files changed (4) hide show
  1. app.py +148 -0
  2. llm.py +43 -0
  3. ml.py +47 -0
  4. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import shutil
4
+ import json
5
+ from ml import VacancyAnalyzer
6
+
7
+
8
class _FreshDict:
    """Descriptor that returns a new copy of a template dict on every access.

    Used for ``GlobalState.default_data`` so that code doing
    ``GlobalState.data = GlobalState.default_data`` gets an independent
    record instead of an alias of the shared defaults.
    """

    def __init__(self, template):
        # Keep a private copy so later changes to the caller's dict cannot leak in.
        self._template = dict(template)

    def __get__(self, instance, owner=None):
        # Invoked for both class-level and instance-level attribute access.
        return dict(self._template)


class GlobalState:
    """
    Class to store global variables

    All attributes are class-level and are read/written directly on the
    class (no instance is created anywhere in this file).
    """
    _base_dir = os.path.dirname(__file__)

    # Location of the JSON archive produced by predictions and its directory.
    result_file_path = os.path.join(_base_dir, 'result/archive.json')
    result_dir = os.path.join(_base_dir, 'result')

    # Model artefacts expected to sit next to this file.
    bert_path = os.path.join(_base_dir, 'tiny.pt')
    catboost_path = os.path.join(_base_dir, 'best_cat.joblib')

    # Class index predicted by the text classifier -> human readable label.
    conv_classes = {
        0: 'low',
        1: 'middle',
        2: 'high',
    }

    # Template of an empty vacancy record. Every access yields a NEW dict, so
    # mutating the current record never changes these defaults.
    default_data = _FreshDict({
        'id': 'a0000',
        'emp_brand': '',
        'mandatory': '',
        'additional': '',
        'comp_stages': '',
        'work_conditions': '',
        'conversion': 0,
        'conversion_class': 'unknown',
    })

    # The record currently being edited through the UI; set in main().
    data = None
30
+
31
+
32
def cid(txt):
    """Write *txt* into the ``'id'`` field of the current vacancy record."""
    record = GlobalState.data
    record['id'] = txt
34
+
35
+
36
def cbrand(txt):
    """Write *txt* into the ``'emp_brand'`` field of the current vacancy record."""
    record = GlobalState.data
    record['emp_brand'] = txt
38
+
39
+
40
def cmand(txt):
    """Write *txt* into the ``'mandatory'`` field of the current vacancy record."""
    record = GlobalState.data
    record['mandatory'] = txt
42
+
43
+
44
def cadd(txt):
    """Write *txt* into the ``'additional'`` field of the current vacancy record."""
    record = GlobalState.data
    record['additional'] = txt
46
+
47
+
48
def ccomp(txt):
    """Write *txt* into the ``'comp_stages'`` field of the current vacancy record."""
    record = GlobalState.data
    record['comp_stages'] = txt
50
+
51
+
52
def ccond(txt):
    """Write *txt* into the ``'work_conditions'`` field of the current vacancy record."""
    record = GlobalState.data
    record['work_conditions'] = txt
54
+
55
+
56
def submit(chk):
    """Reveal the "Run!" button.

    Args:
        chk: Current state of the checkbox wired to this handler. It is not
            inspected: the button is shown on every change of the checkbox.

    Returns:
        A Gradio update that sets the button label to "Run!" and makes it
        visible.
    """
    # The label must be passed as ``value=``: the first positional parameter
    # of ``gr.update`` in Gradio 4.x is ``elem_id``, not the component value.
    return gr.update(value="Run!", visible=True)
59
+
60
+
61
def append_to_json(_dict, path):
    """Add *_dict* as one more element of the JSON array stored at *path*.

    The file is created with a one-element array when it is missing or
    empty. Otherwise the last byte of the file (expected to be the closing
    ``]``) is cut off and ``' , <object>]'`` is written in its place, so the
    existing content is never re-read or re-serialised.
    """
    encoded = json.dumps(_dict).encode()
    with open(path, 'ab+') as archive:
        # Seeking to the end returns the file size.
        size = archive.seek(0, 2)
        if size == 0:
            archive.write(b'[' + encoded + b']')
            return
        # Position on the final byte and drop it before appending.
        archive.seek(-1, 2)
        archive.truncate()
        archive.write(b' , ' + encoded + b']')
72
+
73
+
74
def predict(btn):
    """Classify the current vacancy record, archive it and reset the form state.

    Args:
        btn: Value of the button wired to this handler (unused).

    Returns:
        A pair of Gradio updates: the result text for the output textbox, and
        an update that makes the archive button visible and points it at the
        JSON archive file.
    """
    analyzer = VacancyAnalyzer(GlobalState.bert_path, GlobalState.catboost_path, GlobalState.data)
    status, result = analyzer.classify()
    gr.Info(status)

    # classify() returns the string 'unknown' when the text is too short and
    # an array of class indices otherwise. Check the type instead of comparing
    # an array with a string, whose outcome depends on the NumPy version.
    if not isinstance(result, str):
        result = GlobalState.conv_classes[int(result[0])]

    out_2 = f'Predicted by vacancy description conversion - {result}'
    GlobalState.data['conversion_class'] = result
    append_to_json(GlobalState.data, GlobalState.result_file_path)

    # Start the next record from a copy so the defaults are never aliased by
    # the record that the textbox callbacks mutate.
    GlobalState.data = dict(GlobalState.default_data)

    link = GlobalState.result_file_path
    return gr.update(value=out_2), gr.update(link="/file=" + link, visible=True)
87
+
88
+
89
def save(btn):
    """Return an update that points the archive button at the JSON archive.

    Args:
        btn: Value of the button wired to this handler (unused).
    """
    archive_url = f"/file={GlobalState.result_file_path}"
    return gr.update(link=archive_url)
92
+
93
+
94
def main():
    """Prepare an empty result directory, build the Gradio UI and launch it."""
    # Start every run with an empty result directory. Removal is best-effort,
    # so creation must tolerate a directory that could not be deleted.
    shutil.rmtree(GlobalState.result_dir, ignore_errors=True)
    os.makedirs(GlobalState.result_dir, exist_ok=True)

    # Work on a copy so the textbox callbacks never mutate the defaults.
    GlobalState.data = dict(GlobalState.default_data)

    with gr.Blocks() as demo:
        with gr.Tab("Load"):
            with gr.Row():
                gr.Markdown(
                    """
                    # Input the text description of the position
                    # 👾👾👾 Then press **Run!** 👾👾👾
                    """)
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        brand = gr.Textbox(label='Company name', value=None)
                    with gr.Row():
                        vid = gr.Textbox(label='Position id', value=None)
                    with gr.Row():
                        req = gr.Textbox(label='Mandatory')
                with gr.Column():
                    with gr.Row():
                        add = gr.Textbox(label='Additional')
                    with gr.Row():
                        comp = gr.Textbox(label='Competition stage')
                    with gr.Row():
                        cond = gr.Textbox(label='Work conditions')

            with gr.Column():
                with gr.Row():
                    with gr.Column():
                        ready = gr.Checkbox(label='Data Filled')
                    with gr.Column():
                        process_button = gr.Button("Run!", visible=False, interactive=True)
                with gr.Row():
                    output_2 = gr.Textbox(label='LLM Result')
                with gr.Row():
                    download_button = gr.Button("JSON Archive", visible=False)

        # Each textbox mirrors its content into the current record on change.
        brand.change(cbrand, inputs=[brand])
        vid.change(cid, inputs=[vid])
        req.change(cmand, inputs=[req])
        add.change(cadd, inputs=[add])
        comp.change(ccomp, inputs=[comp])
        cond.change(ccond, inputs=[cond])

        # Changing the checkbox reveals the run button; running fills the
        # result textbox and reveals the archive button.
        ready.change(submit, inputs=[ready], outputs=[process_button])
        process_button.click(predict, inputs=[process_button], outputs=[output_2, download_button],
                             show_progress='full')
        download_button.click(save, inputs=[download_button], outputs=[download_button])

    # allowed_paths lists the result directory so the "/file=" link to the
    # archive can be served.
    demo.launch(share=True, allowed_paths=[GlobalState.result_dir])
145
+
146
+
147
# Build and launch the app only when this file is executed as a script.
if __name__ == "__main__":
    main()
llm.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ from transformers import AutoTokenizer, AutoModel, BertConfig
4
+
5
+
6
class TransformerRegrModel(nn.Module):
    """BERT encoder followed by ReLU and one linear layer on the first-token embedding.

    The encoder is built from the checkpoint's *configuration* only
    (``AutoModel.from_config``), so its weights are expected to be supplied
    afterwards, e.g. through ``load_state_dict``.

    Args:
        base_transformer_model: Short model name, ``'rubert'`` or ``'base'``.
        num_classes: Size of the output layer.

    Raises:
        ValueError: If ``base_transformer_model`` is not a supported name.
    """

    # Supported short name -> (Hugging Face checkpoint id, extra tokenizer kwargs).
    _CHECKPOINTS = {
        'rubert': ("cointegrated/rubert-tiny2", {}),
        'base': ("ai-forever/ruBert-base", {'model_max_length': 512}),
    }

    def __init__(self, base_transformer_model: str, num_classes: int):
        super().__init__()
        self.tr_model = base_transformer_model
        self.num = num_classes
        # Preferred device at construction time. forward() does not rely on
        # it; it follows the device the parameters actually live on.
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

        if self.tr_model not in self._CHECKPOINTS:
            supported = ', '.join(repr(name) for name in self._CHECKPOINTS)
            raise ValueError(f'unknown model {self.tr_model!r}; expected one of: {supported}')

        checkpoint, tokenizer_kwargs = self._CHECKPOINTS[self.tr_model]
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint, **tokenizer_kwargs)
        self.config = BertConfig.from_pretrained(checkpoint, output_hidden_states=True,
                                                 output_attentions=True)
        self.model = AutoModel.from_config(self.config)
        self.a1 = nn.ReLU()
        self.classifier_1 = nn.Linear(self.model.pooler.dense.out_features, self.num)

    def forward(self, inputs):
        """Tokenize *inputs*, encode them and apply the linear head.

        Args:
            inputs: Text (or batch of texts) accepted by the tokenizer.

        Returns:
            A tuple ``(outputs, tokens, attentions)``: the linear-layer
            outputs, the token strings of the first sequence in the batch,
            and the attention tensors of all layers concatenated along
            dimension 0 and moved to the CPU.
        """
        t = self.tokenizer(inputs, padding=True, truncation=True, return_tensors='pt')
        tokens = self.tokenizer.convert_ids_to_tokens(t['input_ids'][0])

        # Send the inputs wherever the weights are, so the module also works
        # when it has not been moved to the device chosen in __init__.
        device = next(self.parameters()).device
        model_output = self.model(**{k: v.to(device) for k, v in t.items()})

        attentions = torch.cat(model_output['attentions']).to('cpu')
        # Embedding of the first token of every sequence, L2-normalised.
        embeddings = model_output.last_hidden_state[:, 0, :]
        embeddings = torch.nn.functional.normalize(embeddings)
        outputs = self.a1(embeddings)
        outputs = self.classifier_1(outputs)

        return outputs, tokens, attentions
ml.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from catboost import Pool
3
+ import joblib
4
+ import torch
5
+ import re
6
+
7
+ from llm import TransformerRegrModel
8
+
9
+
10
class VacancyAnalyzer:
    """Run the CatBoost regressor or the BERT text classifier on one vacancy record."""

    def __init__(self, transformer_path: str, catboost_path: str, inputs: dict):
        """Keep the model paths and turn *inputs* into a one-row DataFrame.

        The ``'conversion'``, ``'conversion_class'`` and ``'id'`` keys are
        dropped from the frame.
        """
        frame = pd.DataFrame(inputs, index=[0])
        self.transformer_path = transformer_path
        self.catboost_path = catboost_path
        self.inputs = frame.drop(columns=['conversion', 'conversion_class', 'id'], axis=1)
        self.cat_features = ['profession', 'grade', 'location']
        self.text_features = ['emp_brand', 'mandatory', 'additional', 'comp_stages', 'work_conditions']
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')

    def __cleaner__(self, txt: str) -> str:
        """Remove ``_..._`` spans, then every newline and tab, from *txt*."""
        without_marks = re.sub(r'\_(.*?)\_', r'', txt)
        return re.sub(r'([\n\t]*)', r'', without_marks)

    def predict(self) -> float:
        """Return the first prediction of the model stored at ``catboost_path``.

        The text columns are dropped before building the CatBoost pool.
        NOTE(review): the pool is declared with ``cat_features`` columns that
        must be present in the inputs - confirm against the caller.
        """
        features = self.inputs.drop(columns=self.text_features, axis=1)
        regressor = joblib.load(self.catboost_path)
        predictions = regressor.predict(Pool(features, cat_features=self.cat_features))
        return predictions.tolist()[0]

    def classify(self) -> tuple:
        """Classify the concatenated text fields with the BERT model.

        Returns:
            ``('Too short text', 'unknown')`` when the cleaned description is
            shorter than 100 characters, otherwise
            ``('Text analyzing finished', prediction)`` where *prediction* is
            a NumPy array holding the arg-max class index.
        """
        # Every field is followed by a single space, including the last one.
        pieces = [self.inputs[column].values[0] for column in self.text_features]
        description = self.__cleaner__(' '.join(pieces) + ' ')
        if len(description) < 100:
            return 'Too short text', 'unknown'

        tbert = TransformerRegrModel('rubert', 3)
        weights = torch.load(self.transformer_path, map_location=torch.device(self.device))
        tbert.load_state_dict(weights)
        tbert.to(self.device)
        tbert.eval()
        with torch.no_grad():
            outputs, _, _ = tbert(description)
        prediction = torch.argmax(outputs, 1).cpu().numpy()
        return 'Text analyzing finished', prediction
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ pandas==2.0.3
2
+ joblib==1.3.2
3
+ torch==2.0.1+cpu
4
+ catboost==1.2
5
+ transformers==4.40.0
6
+ gradio==4.27.0