Christopher Glaze committed
Commit b208d2e
1 Parent(s): 34c2f2c

Initial commit of model files.
Files changed:
- .gitignore +1 -0
- handler.py +138 -0
- instruction_classification_pipeline.joblib +3 -0
- instruction_label_map.json +1 -0
- requirements.txt +7 -0
- response_quality_pipeline.joblib +3 -0
- stop_words.json +1 -0
- tests.py +13 -0
.gitignore
ADDED
@@ -0,0 +1 @@
**/__pycache__
handler.py
ADDED
@@ -0,0 +1,138 @@
from typing import Dict, List, Tuple, Union, Optional
from pathlib import Path
import json

import joblib
import nltk
import numpy as np
import pandas as pd
import torch
from sklearn.base import TransformerMixin
from transformers import AutoModel, AutoTokenizer


class SimcseGenerator(TransformerMixin):
    """Maps raw strings to L2-normalized SimCSE sentence embeddings."""

    def __init__(
        self,
        device: str = "cpu",
        batch_size: int = 16,  # any larger, and we risk running out of memory on EC2 dev instances
        model_name: str = "princeton-nlp/unsup-simcse-bert-base-uncased",
    ) -> None:
        self.model_name = model_name
        self.device = torch.device(device)

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        self.batch_size = batch_size

    def transform(self, X: List[str]) -> np.ndarray:
        embeddings = []

        for start in range(0, len(X), self.batch_size):
            end = min(len(X), start + self.batch_size)
            inputs = self.tokenizer(
                X[start:end],
                padding=True,
                truncation=True,
                return_tensors="pt",
            )
            with torch.no_grad():
                inputs = inputs.to(self.device)
                batch_embeddings = self.model(
                    **inputs, output_hidden_states=True, return_dict=True
                ).pooler_output
            embeddings.append(batch_embeddings.cpu().numpy())

        embeddings = np.concatenate(embeddings)
        # L2-normalize each row so that dot products between rows are cosine similarities.
        embeddings /= np.sqrt(np.square(embeddings).sum(axis=1))[:, np.newaxis]

        return embeddings


class EndpointHandler:
    def __init__(self, device: str = "cpu"):
        # Preload all the elements needed at inference.
        local_path = Path(__file__).parent
        self.device = device

        with open(local_path / "stop_words.json", "r") as fp:
            self.stop_words = set(json.load(fp))

        with open(local_path / "instruction_label_map.json", "r") as fp:
            # JSON keys are strings; convert them back to integer class indices.
            self.instruction_label_map = {int(k): v for k, v in json.load(fp).items()}

        self.instruction_pipeline = joblib.load(local_path / "instruction_classification_pipeline.joblib")
        self.response_pipeline = joblib.load(local_path / "response_quality_pipeline.joblib")

        self.simcse_generator = SimcseGenerator(device=self.device)

    def _get_stop_word_proportion(self, s: str) -> float:
        s = s.lower()
        try:
            words = nltk.tokenize.word_tokenize(s)
        except Exception:
            # Retry without the first character (e.g., a stray byte that trips the tokenizer).
            words = nltk.tokenize.word_tokenize(s[1:])

        if len(words) == 0:
            return 0
        return sum(x in self.stop_words for x in words) / len(words)

    def predict_instruction_classes(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
        instruction_classes = self.instruction_pipeline.predict(df)
        instruction_class_confidence = self.instruction_pipeline.predict_proba(df).max(axis=1)
        return (
            np.array([self.instruction_label_map[x] for x in instruction_classes]),
            instruction_class_confidence,
        )

    def compute_response_quality_feature_space(
        self, df: pd.DataFrame, instruction_classes: Optional[np.ndarray] = None
    ) -> pd.DataFrame:
        if instruction_classes is None:
            instruction_classes, _ = self.predict_instruction_classes(df)

        instruction_class_set = [self.instruction_label_map[i] for i in range(len(self.instruction_label_map))]

        # One-hot encode the predicted instruction class of each row.
        instruction_classes_onehot = pd.DataFrame(
            instruction_classes[:, np.newaxis] == np.array(instruction_class_set)[np.newaxis, :],
            columns=instruction_class_set,
        ).astype(float)

        df1 = pd.concat([df, instruction_classes_onehot], axis=1)

        # Dot product of L2-normalized embeddings = cosine similarity between instruction and response.
        df1["instruction_response_similarity"] = (
            self.simcse_generator.transform(df["instruction"].tolist())
            * self.simcse_generator.transform(df["response"].tolist())
        ).sum(axis=1)

        df1["token_number"] = df1["response"].str.split().apply(len)
        df1["stop_word_proportion"] = df1["response"].apply(self._get_stop_word_proportion)

        return df1

    def predict_response_quality(self, df: pd.DataFrame, instruction_classes: np.ndarray) -> np.ndarray:
        df1 = self.compute_response_quality_feature_space(df, instruction_classes)
        return self.response_pipeline.predict_proba(df1)[:, 1]

    def __call__(self, df: Union[pd.DataFrame, Dict]):
        is_dict = isinstance(df, dict)
        if is_dict:
            df = pd.DataFrame([df])

        if "dataset" not in df.columns:
            df["dataset"] = ""

        instruction_classes, instruction_class_confidences = self.predict_instruction_classes(df)

        predictions = [
            {"instruction class": instruction_class, "instruction class confidence": confidence}
            for instruction_class, confidence in zip(instruction_classes, instruction_class_confidences)
        ]

        # Response quality is only scored when a response column is present.
        if "response" in df.columns:
            response_qualities = self.predict_response_quality(df, instruction_classes)
            for i, response_quality in enumerate(response_qualities):
                predictions[i].update({"response quality": response_quality})

        if is_dict:
            return predictions[0]
        return pd.DataFrame(predictions, index=df.index)
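
For reference, the instruction_response_similarity feature reduces to a cosine similarity: transform() L2-normalizes each row, so the row-wise dot product of two embedding matrices is the cosine of the angle between the paired rows. A minimal sketch (sample texts reuse the payload from tests.py; the model weights are downloaded on first use):

from handler import SimcseGenerator

simcse = SimcseGenerator()

instructions = ["What are some ways to stay energized throughout the day?"]
responses = ["Drink lots of coffee!"]

a = simcse.transform(instructions)  # shape (1, 768), rows L2-normalized
b = simcse.transform(responses)

# Row-wise dot product of unit vectors = cosine similarity,
# the same quantity handler.py stores as 'instruction_response_similarity'.
print((a * b).sum(axis=1))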
instruction_classification_pipeline.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2bba5f13a13cdd94cb631f160c9e09ef3c93c6ead7622ec4b0b960a17ff7487c
size 21068584
instruction_label_map.json
ADDED
@@ -0,0 +1 @@
{"0": "open-qa", "1": "closed-qa", "2": "summarization", "3": "generation", "4": "brainstorming", "5": "other"}
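
JSON object keys are always strings, so handler.py converts them back to integer class indices at load time. A minimal sketch of that round trip against this file:

import json

with open("instruction_label_map.json") as fp:
    label_map = {int(k): v for k, v in json.load(fp).items()}

print(label_map[2])  # -> "summarization"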
requirements.txt
ADDED
@@ -0,0 +1,7 @@
numpy==1.24.2
pandas==1.5.3
torch==2.0.1
transformers==4.30.2
nltk==3.8.1
scikit-learn==1.3.0
xgboost==1.7.6
response_quality_pipeline.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a116abfb535671bb07da6ca5f211f671f2aeb856b093a8a8d5c62df17d4ed5a3
size 37881144
stop_words.json
ADDED
@@ -0,0 +1 @@
["y", "own", "here", "hadn", "our", "s", "yours", "you'll", "those", "having", "between", "ain", "each", "haven't", "needn", "under", "can", "before", "but", "because", "me", "hasn't", "against", "then", "only", "at", "just", "weren", "of", "above", "as", "further", "myself", "you", "some", "yourself", "from", "out", "shouldn", "while", "be", "wouldn", "into", "her", "mightn't", "their", "m", "same", "any", "wasn", "if", "who", "for", "t", "shan't", "mightn", "have", "i", "he", "a", "are", "does", "over", "until", "its", "my", "couldn't", "you'd", "to", "not", "ma", "whom", "won't", "the", "being", "shouldn't", "ourselves", "isn", "about", "did", "shan", "after", "didn't", "them", "once", "all", "too", "she's", "or", "in", "on", "am", "mustn't", "than", "that", "few", "that'll", "ve", "with", "couldn", "itself", "down", "it's", "where", "such", "isn't", "didn", "again", "will", "when", "through", "him", "mustn", "won", "doesn", "o", "herself", "weren't", "themselves", "which", "don", "wasn't", "below", "most", "re", "what", "your", "ours", "you're", "she", "his", "himself", "up", "yourselves", "should've", "this", "is", "you've", "d", "very", "these", "more", "off", "doing", "hers", "and", "both", "how", "aren't", "do", "needn't", "it", "don't", "no", "haven", "were", "by", "hasn", "during", "wouldn't", "aren", "had", "should", "has", "been", "other", "ll", "theirs", "why", "so", "doesn't", "an", "hadn't", "nor", "was", "there", "now", "they", "we"]
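
The list appears to match NLTK's English stop-word set, which pairs naturally with the nltk tokenizer used in _get_stop_word_proportion. A quick sanity check, assuming the stopwords corpus can be downloaded:

import json
import nltk

nltk.download("stopwords", quiet=True)
from nltk.corpus import stopwords

with open("stop_words.json") as fp:
    repo_words = set(json.load(fp))

print(repo_words == set(stopwords.words("english")))  # expected True if the list is unmodified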
tests.py
ADDED
@@ -0,0 +1,13 @@
from handler import EndpointHandler

# init handler
# note: handler's word_tokenize needs NLTK's 'punkt' model;
# run nltk.download('punkt') once if it is not already cached
response_model_handler = EndpointHandler()

# prepare sample payload
payload = {
    "instruction": "What are some ways to stay energized throughout the day?",
    "response": "Drink lots of coffee!",
}

# test the handler
pred = response_model_handler(payload)

print(pred)
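
EndpointHandler.__call__ also accepts a DataFrame and then returns a DataFrame of predictions, scoring response quality whenever a response column is present. A hypothetical batch call (rows are made up for illustration):

import pandas as pd
from handler import EndpointHandler

handler = EndpointHandler()

batch = pd.DataFrame([
    {"instruction": "Summarize the plot of Moby-Dick in one sentence.",
     "response": "A captain obsessively hunts the white whale that took his leg."},
    {"instruction": "What is the capital of France?",
     "response": "Paris."},
])

# one row per input: instruction class, confidence, response quality
preds = handler(batch)
print(preds)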