# Spaces: Sleeping
import pandas as pd | |
from catboost import Pool | |
import joblib | |
import torch | |
import re | |
from llm import TransformerRegrModel | |
class VacancyAnalyzer:
    """Run two models over a single vacancy record.

    ``predict`` feeds the non-text columns to a regressor loaded from
    ``catboost_path``; ``classify`` feeds the concatenated text columns to a
    ``TransformerRegrModel`` whose weights are loaded from ``transformer_path``.
    """

    # Columns that may accompany a raw record but are never used as features.
    _NON_FEATURE_COLUMNS = ['conversion', 'conversion_class', 'id']
    # Descriptions shorter than this (after cleaning) are not classified.
    _MIN_DESCRIPTION_LENGTH = 100

    def __init__(self, transformer_path: str, catboost_path: str, inputs: dict):
        """Store the model paths and turn ``inputs`` into a one-row DataFrame.

        Args:
            transformer_path: Path passed to ``torch.load`` in ``classify``.
            catboost_path: Path passed to ``joblib.load`` in ``predict``.
            inputs: Mapping of column name to a single value for one vacancy.
                The ``conversion``, ``conversion_class`` and ``id`` keys are
                removed when present and are not required.
        """
        self.transformer_path = transformer_path
        self.catboost_path = catboost_path
        # errors='ignore': records without target/id columns (the usual case
        # at inference time) must not fail with KeyError.
        self.inputs = pd.DataFrame(inputs, index=[0]).drop(
            columns=self._NON_FEATURE_COLUMNS, errors='ignore'
        )
        self.cat_features = ['profession', 'grade', 'location']
        self.text_features = ['emp_brand', 'mandatory', 'additional', 'comp_stages', 'work_conditions']
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    def __cleaner__(self, txt: str) -> str:
        """Return ``txt`` without ``_..._`` spans and without newlines/tabs.

        The underscore pass runs first and ``.`` does not match a newline, so
        an underscore pair that spans a line break is left in place.
        """
        txt = re.sub(r'\_(.*?)\_', r'', txt)
        txt = re.sub(r'[\n\t]+', r'', txt)
        return txt

    def predict(self) -> float:
        """Return the regressor's prediction for the stored record.

        The text columns are excluded; the remaining columns are wrapped in a
        CatBoost ``Pool`` with ``self.cat_features`` marked as categorical.

        Returns:
            The first element of the model's prediction.
        """
        # The text columns are not read here, so their absence is tolerated.
        df = self.inputs.drop(columns=self.text_features, errors='ignore')
        pool = Pool(df, cat_features=self.cat_features)
        # NOTE(security): joblib.load unpickles the file; only load model
        # files from a trusted location.
        regressor = joblib.load(self.catboost_path)
        prediction = regressor.predict(pool).tolist()
        return prediction[0]

    def classify(self) -> tuple:
        """Classify the vacancy from its concatenated text fields.

        The text columns are joined with single spaces (plus a trailing
        space), cleaned with ``__cleaner__`` and, if long enough, passed to
        the transformer model.

        Returns:
            ``('Too short text', 'unknown')`` when the cleaned description has
            fewer than 100 characters; otherwise
            ``('Text analyzing finished', prediction)`` where ``prediction``
            is the NumPy array produced by ``argmax`` over dimension 1 of the
            model's first output.
        """
        parts = []
        for name in self.text_features:
            value = self.inputs[name].values[0]
            # A missing field (None/NaN) contributes empty text instead of
            # raising TypeError during concatenation.
            parts.append('' if pd.isna(value) else str(value))
        description = self.__cleaner__(' '.join(parts) + ' ')

        if len(description) < self._MIN_DESCRIPTION_LENGTH:
            return 'Too short text', 'unknown'

        # NOTE(review): the second argument is presumably the number of
        # output classes -- confirm against llm.TransformerRegrModel.
        tbert = TransformerRegrModel('rubert', 3)
        # self.device is already a torch.device, so it is passed as-is.
        tbert.load_state_dict(torch.load(self.transformer_path, map_location=self.device))
        tbert.to(self.device)
        tbert.eval()
        with torch.no_grad():
            outputs, _, _ = tbert(description)
        prediction = torch.argmax(outputs, 1).cpu().numpy()
        return 'Text analyzing finished', prediction