Spaces:

mahynski
/

RAG

Sleeping

mahynski committed on Aug 8, 2024

Commit

f5ca364

verified ·

1 Parent(s): 5b673e1

manual update for mistralai tokens

Files changed (1) hide show

app.py CHANGED Viewed

@@ -21,7 +21,11 @@ class MistralTokens:
     """
     def __init__(self, llm_name):
         from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
-        self.tokenizer = MistralTokenizer.from_model(llm_name)
     def __call__(self, input):
         """This returns all the tokens indices in a list since LlamaIndex seems to count by calling `len()` on the tokenizer function."""
@@ -71,7 +75,7 @@ def main():
         elif provider == 'huggingface':
             llm_list = []
         elif provider == 'mistralai':
-            llm_list = ["mistral-small-latest", "mistral-medium-latest", "mistral-large-latest", "open-mistral-nemo-latest"]
         elif provider == 'openai':
             llm_list = ['gpt-3.5-turbo', 'gpt-4', 'gpt-4-turbo', 'gpt-4o', 'gpt-4o-mini']
         else:

     """
     def __init__(self, llm_name):
         from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
+        if 'open-mistral-nemo' in llm_name:
+            self.tokenizer = MistralTokenizer.v3(is_tekken=True)
+        else:
+            # This might work for all models, but their documentation is unclear.
+            self.tokenizer = MistralTokenizer.from_model(llm_name)
     def __call__(self, input):
         """This returns all the tokens indices in a list since LlamaIndex seems to count by calling `len()` on the tokenizer function."""
         elif provider == 'huggingface':
             llm_list = []
         elif provider == 'mistralai':
+            llm_list = ["mistral-small-latest", "mistral-large-latest", "open-mistral-nemo-latest"]
         elif provider == 'openai':
             llm_list = ['gpt-3.5-turbo', 'gpt-4', 'gpt-4-turbo', 'gpt-4o', 'gpt-4o-mini']
         else: