hustcw
/

clap-text

Feature Extraction

Transformers

Safetensors

mpnet

custom_code

Model card Files Files and versions Community

hustcw committed on Feb 29, 2024

Commit

4e6097c

1 Parent(s): d2f441b

update modeling

Browse files

Files changed (1) hide show

clap_modeling.py +29 -1

clap_modeling.py CHANGED Viewed

@@ -66,7 +66,7 @@ class AsmTokenizer(MPNetTokenizerFast):
                 tokenized_functions['instr'] = tokenized_functions['instr'][:self.model_max_length]
                 break
         return tokenized_functions
     def encode_function(self, function):
         tokenized_functions = self.tokenize_function(function)
         token_ids = self.convert_tokens_to_ids(tokenized_functions["token"])
@@ -76,6 +76,34 @@ class AsmTokenizer(MPNetTokenizerFast):
             "attention_mask": [1] * len(token_ids),
             "token_type_ids": instr_ids,
         })
     @property
     def vocab_size(self) -> int:

                 tokenized_functions['instr'] = tokenized_functions['instr'][:self.model_max_length]
                 break
         return tokenized_functions
     def encode_function(self, function):
         tokenized_functions = self.tokenize_function(function)
         token_ids = self.convert_tokens_to_ids(tokenized_functions["token"])
             "attention_mask": [1] * len(token_ids),
             "token_type_ids": instr_ids,
         })
+    def __call__(self, functions, **kwargs):
+        if len(functions) == 0:
+            return BatchEncoding({
+                "input_ids": [],
+                "attention_mask": [],
+                "token_type_ids": [],
+            })
+        if not isinstance(functions, list):
+            raise ValueError("functions must be a list of dict")
+        elif not isinstance(functions[0], dict):
+            raise ValueError("functions must be a list of dict")
+        else:
+            batch_encode_result = {
+                "input_ids": [],
+                "attention_mask": [],
+                "token_type_ids": [],
+            }
+            for function in functions:
+                tokenized_functions = self.tokenize_function(function)
+                token_ids = self.convert_tokens_to_ids(tokenized_functions["token"])
+                instr_ids = self.convert_tokens_to_ids(tokenized_functions["instr"])
+                attention_mask = [1] * len(token_ids)
+                batch_encode_result["input_ids"].append(token_ids)
+                batch_encode_result["attention_mask"].append(attention_mask)
+                batch_encode_result["token_type_ids"].append(instr_ids)
+            batch_encoding = BatchEncoding(batch_encode_result)
+            return self.pad(batch_encoding, **kwargs)
     @property
     def vocab_size(self) -> int: