llm4career / ml.py
fkonovalenko's picture
first commit
d99e452
raw
history blame
1.56 kB
import pandas as pd
import joblib
import torch
import re
from llm import TransformerRegrModel
class VacancyAnalyzer:
    """Run a saved transformer model over the text fields of one vacancy.

    The vacancy is supplied as a flat dict of scalar values and stored as a
    one-row DataFrame. ``classify`` glues the text fields together, cleans
    the result and feeds it to a ``TransformerRegrModel`` whose weights are
    loaded from ``transformer_path``.
    """

    # Descriptions shorter than this (in characters, after cleaning) are
    # rejected by ``classify`` without loading the model.
    MIN_DESCRIPTION_LENGTH = 100

    # Columns removed from the input row when present.
    _DROPPED_COLUMNS = ['conversion', 'conversion_class', 'id']

    # Compiled once: the shortest span enclosed in a pair of underscores
    # (``.`` does not cross newlines), and runs of newline/tab characters.
    _UNDERSCORE_SPAN = re.compile(r'_(.*?)_')
    _NEWLINES_TABS = re.compile(r'[\n\t]+')

    def __init__(self, transformer_path: str, inputs: dict):
        """Store the checkpoint path and convert ``inputs`` to a one-row frame.

        Args:
            transformer_path: Path of the saved state dict passed to
                ``torch.load`` in ``classify``.
            inputs: Mapping of column name to a scalar value describing one
                vacancy. It has to contain every name in ``text_features``
                for ``classify`` to work.
        """
        self.transformer_path = transformer_path
        # errors='ignore' lets the analyzer accept rows that do not carry the
        # label/identifier columns; without it a missing column raises
        # KeyError before any analysis can happen.
        self.inputs = pd.DataFrame(inputs, index=[0]).drop(
            columns=self._DROPPED_COLUMNS,
            errors='ignore',
        )
        # NOTE(review): cat_features is not read anywhere in this class;
        # presumably consumed by other code — confirm before removing.
        self.cat_features = ['profession', 'grade', 'location']
        self.text_features = ['emp_brand', 'mandatory', 'additional', 'comp_stages', 'work_conditions']
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    def __cleaner__(self, txt: str) -> str:
        """Return ``txt`` without ``_..._`` spans and newline/tab characters.

        Underscore-delimited spans are removed first, so a span interrupted
        by a newline is left in place (only its newline is stripped later).
        Newlines and tabs are deleted, not replaced by a space.
        """
        txt = self._UNDERSCORE_SPAN.sub('', txt)
        return self._NEWLINES_TABS.sub('', txt)

    def classify(self) -> tuple:
        """Build the vacancy description and predict its class.

        Returns:
            A ``(status, prediction)`` tuple. When the cleaned description
            is shorter than ``MIN_DESCRIPTION_LENGTH`` characters the result
            is ``('Too short text', 'unknown')`` and no model is loaded.
            Otherwise it is ``('Text analyzing finished', prediction)`` where
            ``prediction`` is the NumPy array produced by ``argmax`` over
            dimension 1 of the model output.

        Raises:
            KeyError: If a text feature is missing from the inputs.
            TypeError: If a text feature value is not a string.
        """
        # Every field is followed by a single space, including the last one,
        # so the trailing space counts towards the length check below.
        parts = [self.inputs[name].values[0] for name in self.text_features]
        description = self.__cleaner__(' '.join(parts) + ' ')

        if len(description) < self.MIN_DESCRIPTION_LENGTH:
            return 'Too short text', 'unknown'

        # NOTE(review): the arguments ('rubert', 3) are presumably a backbone
        # name and an output count — confirm against the llm module.
        tbert = TransformerRegrModel('rubert', 3)
        # NOTE: torch.load deserializes the checkpoint file; depending on the
        # torch version / weights_only setting this may unpickle arbitrary
        # objects, so only load checkpoints from a trusted source.
        state_dict = torch.load(self.transformer_path, map_location=self.device)
        tbert.load_state_dict(state_dict)
        tbert.to(self.device)
        tbert.eval()

        with torch.no_grad():
            # The model is called with the raw string and returns three
            # values; only the first one is used here.
            outputs, _, _ = tbert(description)
        prediction = torch.argmax(outputs, 1).cpu().numpy()
        return 'Text analyzing finished', prediction