Spaces:

monsterapi
/

Monster-LLMs

Runtime error

File size: 5,725 Bytes

#MonsterAPIClient.py

"""

Python client for the LLM text-generation models hosted on MonsterAPI.



Base URL: https://api.monsterapi.ai/v1 (generation: /generate/{model}, status: /status/{process_id})



Available models:

-----------------

    1. falcon-7b-instruct

    2. falcon-40b-instruct

    3. mpt-30B-instruct

    4. mpt-7b-instruct

    5. openllama-13b-base

    6. llama2-7b-chat



"""
import os
import time
import logging
import requests
from requests_toolbelt.multipart.encoder import MultipartEncoder

from typing import Optional, Literal, Union, List, Dict
from pydantic import BaseModel, Field

# Configure the root logger at import time so this module's INFO-level
# progress messages are visible by default.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


class InputModel1(BaseModel):
    """Request parameters for the Falcon, OpenLLaMA and Llama 2 endpoints.

    Used for: falcon-40b-instruct, falcon-7b-instruct, openllama-13b-base,
    llama2-7b-chat.

    Fields:
        prompt: Textual instruction for the model to produce an output.
            Required.
        top_k: Top-k sampling helps improve quality by removing the tail and
            making the output less likely to go off topic. Defaults to 40.
        top_p: Top-p sampling helps generate more diverse and creative text
            by considering a broader range of tokens. Must be within
            [0, 1]. Defaults to 0.9.
        temp: Temperature; influences the randomness of the next-token
            predictions. Must be within [0, 1]. Defaults to 0.98.
        max_length: Maximum length of the generated text. Defaults to 256.
        repetition_penalty: Penalty the model uses to discourage repeating
            tokens in the output. Defaults to 1.2.
        beam_size: Beam size for beam search. A larger beam gives better
            quality output but slower generation. Defaults to 1.
    """

    # Field order is kept as declared; it determines the order of the
    # serialized fields.
    prompt: str
    top_k: int = 40
    top_p: float = Field(default=0.9, ge=0.0, le=1.0)
    temp: float = Field(default=0.98, ge=0.0, le=1.0)
    max_length: int = 256
    repetition_penalty: float = 1.2
    beam_size: int = 1


class InputModel2(BaseModel):
    """Request parameters for the MPT endpoints.

    Used for: mpt-30B-instruct, mpt-7b-instruct.

    Fields:
        prompt: Textual command for the model to produce an output.
            Required.
        top_k: Top-k sampling helps improve quality by removing the tail and
            making the output less likely to go off topic. Defaults to 40.
        top_p: Top-p sampling helps generate more diverse and creative text
            by considering a broader range of tokens. Must be within
            [0, 1]. Defaults to 0.9.
        temp: Temperature; controls the randomness of the model's output
            (higher means more random). Must be within [0, 1]. Defaults to
            0.98.
        max_length: Maximum length of the generated output. Defaults to 256.
    """

    # Field order is kept as declared; it determines the order of the
    # serialized fields.
    prompt: str
    top_k: int = 40
    top_p: float = Field(default=0.9, ge=0.0, le=1.0)
    temp: float = Field(default=0.98, ge=0.0, le=1.0)
    max_length: int = 256

# Registry of supported model names (exactly as they appear in the request
# URL) mapped to the pydantic model that validates their parameters.
MODELS_TO_DATAMODEL = {
    'falcon-7b-instruct': InputModel1,
    'falcon-40b-instruct': InputModel1,
    'mpt-30B-instruct': InputModel2,
    'mpt-7b-instruct': InputModel2,
    'openllama-13b-base': InputModel1,
    'llama2-7b-chat': InputModel1,
}


class MClient:
    """Minimal HTTP client for the MonsterAPI text-generation endpoints.

    A generation request is submitted with ``get_response``; the job it
    starts is then polled with ``get_status`` / ``wait_and_get_result``.

    Environment variables:
        MONSTER_API_KEY: Bearer token used for authentication. Required
            unless ``api_key`` is passed to the constructor.
        MOCK_Runner: When "true" (case-insensitive; this is also the default
            when the variable is unset), ``wait_and_get_result`` returns
            placeholder text instead of waiting for an unfinished job.
    """

    def __init__(self, api_key: Optional[str] = None):
        """Build the request headers and read configuration.

        Args:
            api_key: API token. When omitted, the MONSTER_API_KEY
                environment variable is used.

        Raises:
            ValueError: If no API key is available from either source.
        """
        token = api_key if api_key is not None else os.environ.get('MONSTER_API_KEY')
        # Fail early with an actionable message instead of the opaque
        # "can only concatenate str" TypeError from 'Bearer ' + None.
        if not token:
            raise ValueError(
                "Monster API key is missing: set the MONSTER_API_KEY "
                "environment variable or pass api_key to MClient()."
            )

        # Fixed multipart boundary, shared by the content-type header and
        # the encoder built in get_response so the two always agree.
        self.boundary = '---011000010111000001101001'
        self.auth_token = token
        self.headers = {
            "accept": "application/json",
            "content-type": f"multipart/form-data; boundary={self.boundary}",
            'Authorization': 'Bearer ' + self.auth_token,
        }
        self.base_url = 'https://api.monsterapi.ai/v1'
        self.models_to_data_model = MODELS_TO_DATAMODEL
        self.mock = os.environ.get('MOCK_Runner', "True").lower() == "true"

    def get_response(
        self,
        model: Literal[
            'falcon-7b-instruct',
            'falcon-40b-instruct',
            'mpt-30B-instruct',
            'mpt-7b-instruct',
            'openllama-13b-base',
            'llama2-7b-chat',
        ],
        data: dict,
    ) -> dict:
        """Submit a generation request for ``model``.

        Args:
            model: Name of a supported model; must be a key of
                ``self.models_to_data_model``.
            data: Request parameters; validated by, and filled with the
                defaults of, the data model registered for ``model``.

        Returns:
            The decoded JSON body of the response (callers are expected to
            read the job's ``process_id`` from it).

        Raises:
            ValueError: If ``model`` is not a supported model name.
            requests.HTTPError: If the API answers with an error status.
        """
        if model not in self.models_to_data_model:
            raise ValueError(f"Invalid model: {model}!")

        validated = self.models_to_data_model[model](**data)
        url = f"{self.base_url}/generate/{model}"
        # Multipart form fields must be strings, so stringify every value.
        fields = {key: str(value) for key, value in validated.dict().items()}
        multipart_data = MultipartEncoder(fields=fields, boundary=self.boundary)
        response = requests.post(url, headers=self.headers, data=multipart_data)
        response.raise_for_status()
        return response.json()

    def get_status(self, process_id) -> dict:
        """Fetch the current status document of a job.

        Args:
            process_id: Identifier of the job to query.

        Returns:
            The decoded JSON body of ``GET /v1/status/{process_id}``.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        url = f"{self.base_url}/status/{process_id}"
        response = requests.get(url, headers=self.headers)
        response.raise_for_status()
        return response.json()

    def wait_and_get_result(self, process_id, poll_interval: float = 1.0,
                            timeout: Optional[float] = None):
        """Poll a job until it finishes and return its result.

        Args:
            process_id: Identifier of the job to wait for.
            poll_interval: Seconds to sleep between status checks.
            timeout: Maximum number of seconds to keep polling, or ``None``
                (the default) to wait indefinitely.

        Returns:
            The ``result`` entry of the status document once the status is
            "completed". In mock mode, placeholder text is returned as soon
            as a status check shows the job is neither completed nor failed.

        Raises:
            RuntimeError: If the job reports the status "failed".
            TimeoutError: If ``timeout`` elapses before the job finishes.
        """
        deadline = None if timeout is None else time.monotonic() + timeout
        while True:
            status = self.get_status(process_id)
            state = status['status'].lower()
            if state == 'completed':
                return status['result']
            if state == 'failed':
                raise RuntimeError(f"Process {process_id} failed!")
            if self.mock:
                # Mock mode never waits for a pending job.
                return 100 * "Mock Output!"
            if deadline is not None and time.monotonic() >= deadline:
                raise TimeoutError(
                    f"Process {process_id} did not finish within {timeout} seconds "
                    f"(last status: {status['status']})."
                )
            logger.info(
                "Process %s is still running, status is %s. Waiting for %g seconds...",
                process_id, status['status'], poll_interval,
            )
            time.sleep(poll_interval)

        
if __name__ == '__main__':
    # Manual smoke test: submit one prompt, wait for the job, print the output.
    monster = MClient()
    job = monster.get_response('falcon-7b-instruct', {"prompt": 'How to make a sandwich'})
    print(monster.wait_and_get_result(job['process_id']))