Christopher Glaze committed
Commit b208d2e
1 Parent(s): 34c2f2c

Initial commit of model files.
Files changed:
- .gitignore +1 -0
- handler.py +138 -0
- instruction_classification_pipeline.joblib +3 -0
- instruction_label_map.json +1 -0
- requirements.txt +7 -0
- response_quality_pipeline.joblib +3 -0
- stop_words.json +1 -0
- tests.py +13 -0
.gitignore
ADDED
@@ -0,0 +1 @@
**/__pycache__
handler.py
ADDED
@@ -0,0 +1,138 @@
from typing import Dict, List, Tuple, Union, Optional
from pathlib import Path
import json

import joblib
import nltk
import numpy as np
import pandas as pd
import torch
from sklearn.base import TransformerMixin
from transformers import AutoModel, AutoTokenizer


class SimcseGenerator(TransformerMixin):
    """Maps raw strings to L2-normalized SimCSE sentence embeddings."""

    def __init__(
        self,
        device: str = "cpu",
        batch_size: int = 16,  # any larger, and we risk running out of memory on EC2 dev instances
        model_name: str = "princeton-nlp/unsup-simcse-bert-base-uncased",
    ) -> None:
        self.model_name = model_name
        self.device = torch.device(device)

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        self.batch_size = batch_size

    def transform(self, X: List[str]) -> np.ndarray:
        embeddings = []

        for start in range(0, len(X), self.batch_size):
            end = min(len(X), start + self.batch_size)
            inputs = self.tokenizer(
                X[start:end],
                padding=True,
                truncation=True,
                return_tensors="pt",
            )
            with torch.no_grad():
                inputs = inputs.to(self.device)
                batch_embeddings = self.model(
                    **inputs, output_hidden_states=True, return_dict=True
                ).pooler_output
            embeddings.append(batch_embeddings.cpu().numpy())

        embeddings = np.concatenate(embeddings)
        # L2-normalize each row so that dot products between rows are cosine similarities.
        embeddings /= np.sqrt(np.square(embeddings).sum(axis=1))[:, np.newaxis]

        return embeddings


class EndpointHandler:
    def __init__(self, device: str = "cpu"):
        # Preload all the elements needed at inference.
        local_path = Path(__file__).parent
        self.device = device

        with open(local_path / "stop_words.json", "r") as fp:
            self.stop_words = set(json.load(fp))

        with open(local_path / "instruction_label_map.json", "r") as fp:
            # JSON keys are strings; convert them back to integer class indices.
            self.instruction_label_map = {int(k): v for k, v in json.load(fp).items()}

        self.instruction_pipeline = joblib.load(local_path / "instruction_classification_pipeline.joblib")
        self.response_pipeline = joblib.load(local_path / "response_quality_pipeline.joblib")

        self.simcse_generator = SimcseGenerator(device=self.device)

    def _get_stop_word_proportion(self, s: str) -> float:
        s = s.lower()
        try:
            words = nltk.tokenize.word_tokenize(s)
        except Exception:
            # Retry without the first character (e.g., a stray byte that trips the tokenizer).
            words = nltk.tokenize.word_tokenize(s[1:])

        if len(words) == 0:
            return 0
        return sum(x in self.stop_words for x in words) / len(words)

    def predict_instruction_classes(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
        instruction_classes = self.instruction_pipeline.predict(df)
        instruction_class_confidence = self.instruction_pipeline.predict_proba(df).max(axis=1)
        return (
            np.array([self.instruction_label_map[x] for x in instruction_classes]),
            instruction_class_confidence,
        )

    def compute_response_quality_feature_space(
        self, df: pd.DataFrame, instruction_classes: Optional[np.ndarray] = None
    ) -> pd.DataFrame:
        if instruction_classes is None:
            instruction_classes, _ = self.predict_instruction_classes(df)

        instruction_class_set = [self.instruction_label_map[i] for i in range(len(self.instruction_label_map))]

        # One-hot encode the predicted instruction class of each row.
        instruction_classes_onehot = pd.DataFrame(
            instruction_classes[:, np.newaxis] == np.array(instruction_class_set)[np.newaxis, :],
            columns=instruction_class_set,
        ).astype(float)

        df1 = pd.concat([df, instruction_classes_onehot], axis=1)

        # Dot product of L2-normalized embeddings = cosine similarity between instruction and response.
        df1["instruction_response_similarity"] = (
            self.simcse_generator.transform(df["instruction"].tolist())
            * self.simcse_generator.transform(df["response"].tolist())
        ).sum(axis=1)

        df1["token_number"] = df1["response"].str.split().apply(len)
        df1["stop_word_proportion"] = df1["response"].apply(self._get_stop_word_proportion)

        return df1

    def predict_response_quality(self, df: pd.DataFrame, instruction_classes: np.ndarray) -> np.ndarray:
        df1 = self.compute_response_quality_feature_space(df, instruction_classes)
        return self.response_pipeline.predict_proba(df1)[:, 1]

    def __call__(self, df: Union[pd.DataFrame, Dict]):
        is_dict = isinstance(df, dict)
        if is_dict:
            df = pd.DataFrame([df])

        if "dataset" not in df.columns:
            df["dataset"] = ""

        instruction_classes, instruction_class_confidences = self.predict_instruction_classes(df)

        predictions = [
            {"instruction class": instruction_class, "instruction class confidence": confidence}
            for instruction_class, confidence in zip(instruction_classes, instruction_class_confidences)
        ]

        # Response quality is only scored when a response column is present.
        if "response" in df.columns:
            response_qualities = self.predict_response_quality(df, instruction_classes)
            for i, response_quality in enumerate(response_qualities):
                predictions[i].update({"response quality": response_quality})

        if is_dict:
            return predictions[0]
        return pd.DataFrame(predictions, index=df.index)
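
For reference, the instruction_response_similarity feature reduces to a cosine similarity: transform() L2-normalizes each row, so the row-wise dot product of two embedding matrices is the cosine of the angle between the paired rows. A minimal sketch (sample texts reuse the payload from tests.py; the model weights are downloaded on first use):

from handler import SimcseGenerator

simcse = SimcseGenerator()

instructions = ["What are some ways to stay energized throughout the day?"]
responses = ["Drink lots of coffee!"]

a = simcse.transform(instructions)  # shape (1, 768), rows L2-normalized
b = simcse.transform(responses)

# Row-wise dot product of unit vectors = cosine similarity,
# the same quantity handler.py stores as 'instruction_response_similarity'.
print((a * b).sum(axis=1))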
instruction_classification_pipeline.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2bba5f13a13cdd94cb631f160c9e09ef3c93c6ead7622ec4b0b960a17ff7487c
size 21068584
instruction_label_map.json
ADDED
@@ -0,0 +1 @@
{"0": "open-qa", "1": "closed-qa", "2": "summarization", "3": "generation", "4": "brainstorming", "5": "other"}
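
JSON object keys are always strings, so handler.py converts them back to integer class indices at load time. A minimal sketch of that round trip against this file:

import json

with open("instruction_label_map.json") as fp:
    label_map = {int(k): v for k, v in json.load(fp).items()}

print(label_map[2])  # -> "summarization"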
requirements.txt
ADDED
@@ -0,0 +1,7 @@
numpy==1.24.2
pandas==1.5.3
torch==2.0.1
transformers==4.30.2
nltk==3.8.1
scikit-learn==1.3.0
xgboost==1.7.6
response_quality_pipeline.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a116abfb535671bb07da6ca5f211f671f2aeb856b093a8a8d5c62df17d4ed5a3
size 37881144
stop_words.json
ADDED
@@ -0,0 +1 @@
["y", "own", "here", "hadn", "our", "s", "yours", "you'll", "those", "having", "between", "ain", "each", "haven't", "needn", "under", "can", "before", "but", "because", "me", "hasn't", "against", "then", "only", "at", "just", "weren", "of", "above", "as", "further", "myself", "you", "some", "yourself", "from", "out", "shouldn", "while", "be", "wouldn", "into", "her", "mightn't", "their", "m", "same", "any", "wasn", "if", "who", "for", "t", "shan't", "mightn", "have", "i", "he", "a", "are", "does", "over", "until", "its", "my", "couldn't", "you'd", "to", "not", "ma", "whom", "won't", "the", "being", "shouldn't", "ourselves", "isn", "about", "did", "shan", "after", "didn't", "them", "once", "all", "too", "she's", "or", "in", "on", "am", "mustn't", "than", "that", "few", "that'll", "ve", "with", "couldn", "itself", "down", "it's", "where", "such", "isn't", "didn", "again", "will", "when", "through", "him", "mustn", "won", "doesn", "o", "herself", "weren't", "themselves", "which", "don", "wasn't", "below", "most", "re", "what", "your", "ours", "you're", "she", "his", "himself", "up", "yourselves", "should've", "this", "is", "you've", "d", "very", "these", "more", "off", "doing", "hers", "and", "both", "how", "aren't", "do", "needn't", "it", "don't", "no", "haven", "were", "by", "hasn", "during", "wouldn't", "aren", "had", "should", "has", "been", "other", "ll", "theirs", "why", "so", "doesn't", "an", "hadn't", "nor", "was", "there", "now", "they", "we"]
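
The list appears to match NLTK's English stop-word set, which pairs naturally with the nltk tokenizer used in _get_stop_word_proportion. A quick sanity check, assuming the stopwords corpus can be downloaded:

import json
import nltk

nltk.download("stopwords", quiet=True)
from nltk.corpus import stopwords

with open("stop_words.json") as fp:
    repo_words = set(json.load(fp))

print(repo_words == set(stopwords.words("english")))  # expected True if the list is unmodified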
tests.py
ADDED
@@ -0,0 +1,13 @@
from handler import EndpointHandler

# init handler
# note: handler's word_tokenize needs NLTK's 'punkt' model;
# run nltk.download('punkt') once if it is not already cached
response_model_handler = EndpointHandler()

# prepare sample payload
payload = {
    "instruction": "What are some ways to stay energized throughout the day?",
    "response": "Drink lots of coffee!",
}

# test the handler
pred = response_model_handler(payload)

print(pred)
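
EndpointHandler.__call__ also accepts a DataFrame and then returns a DataFrame of predictions, scoring response quality whenever a response column is present. A hypothetical batch call (rows are made up for illustration):

import pandas as pd
from handler import EndpointHandler

handler = EndpointHandler()

batch = pd.DataFrame([
    {"instruction": "Summarize the plot of Moby-Dick in one sentence.",
     "response": "A captain obsessively hunts the white whale that took his leg."},
    {"instruction": "What is the capital of France?",
     "response": "Paris."},
])

# one row per input: instruction class, confidence, response quality
preds = handler(batch)
print(preds)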