syntax = "proto3";

option go_package = "github.com/go-skynet/LocalAI/pkg/grpc/proto";
option java_multiple_files = true;
option java_package = "io.skynet.localai.backend";
option java_outer_classname = "LocalAIBackend";

package backend;

// Backend is the gRPC service exposed by a model backend. Each declaration
// sits on its own line so that `//` comments cannot swallow the definitions
// that follow them.
service Backend {
  // Health and status.
  rpc Health(HealthMessage) returns (Reply) {}
  rpc Status(HealthMessage) returns (StatusResponse) {}

  // Model loading.
  rpc LoadModel(ModelOptions) returns (Result) {}

  // Calls that take PredictOptions.
  rpc Predict(PredictOptions) returns (Reply) {}
  // Server-streaming variant: returns a stream of Reply messages.
  rpc PredictStream(PredictOptions) returns (stream Reply) {}
  rpc Embedding(PredictOptions) returns (EmbeddingResult) {}
  rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}

  // Image and audio.
  rpc GenerateImage(GenerateImageRequest) returns (Result) {}
  rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
  rpc TTS(TTSRequest) returns (Result) {}
  rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}

  // Key/value store operations (see StoresKey / StoresValue).
  rpc StoresSet(StoresSetOptions) returns (Result) {}
  rpc StoresDelete(StoresDeleteOptions) returns (Result) {}
  rpc StoresGet(StoresGetOptions) returns (StoresGetResult) {}
  rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}

  // Reranking.
  rpc Rerank(RerankRequest) returns (RerankResult) {}

  // Metrics.
  rpc GetMetrics(MetricsRequest) returns (MetricsResponse) {}
}

// Define the empty request
message MetricsRequest {}

// Response of the GetMetrics RPC.
message MetricsResponse {
  int32 slot_id = 1;
  string prompt_json_for_slot = 2; // Stores the prompt as a JSON string.
  float tokens_per_second = 3;
  int32 tokens_generated = 4;
  int32 prompt_tokens_processed = 5;
}

// Request of the Rerank RPC.
message RerankRequest {
  string query = 1;
  repeated string documents = 2;
  int32 top_n = 3;
}

// Response of the Rerank RPC.
message RerankResult {
  Usage usage = 1;
  repeated DocumentResult results = 2;
}

// Token counts carried in RerankResult.
message Usage {
  int32 total_tokens = 1;
  int32 prompt_tokens = 2;
}

// One entry of RerankResult.results.
message DocumentResult {
  // NOTE(review): presumably the position of the document in
  // RerankRequest.documents -- confirm against a backend implementation.
  int32 index = 1;
  string text = 2;
  float relevance_score = 3;
}

// Key type used by the Stores* RPCs: a list of floats.
message StoresKey {
  repeated float Floats = 1;
}

// Value type used by the Stores* RPCs: an opaque byte string.
message StoresValue {
  bytes Bytes = 1;
}

// Request of the StoresSet RPC.
// NOTE(review): Keys and Values look like they are paired by index -- confirm.
message StoresSetOptions {
  repeated StoresKey Keys = 1;
  repeated StoresValue Values = 2;
}

// Request of the StoresDelete RPC.
message StoresDeleteOptions {
  repeated StoresKey Keys = 1;
}

// Request of the StoresGet RPC.
message StoresGetOptions {
  repeated StoresKey Keys = 1;
}

// Response of the StoresGet RPC.
message StoresGetResult {
  repeated StoresKey Keys = 1;
  repeated StoresValue Values = 2;
}

// Request of the StoresFind RPC: a single key plus a TopK count.
message StoresFindOptions {
  StoresKey Key = 1;
  int32 TopK = 2;
}

// Response of the StoresFind RPC.
// NOTE(review): the three repeated fields are presumably index-aligned --
// confirm.
message StoresFindResult {
  repeated StoresKey Keys = 1;
  repeated StoresValue Values = 2;
  repeated float Similarities = 3;
}

// Empty request used by the Health and Status RPCs.
message HealthMessage {}

// Options sent with the Predict, PredictStream, Embedding and TokenizeString
// RPCs.
message PredictOptions {
  // NOTE(review): field number 24 is not used in this message. If it belonged
  // to a removed field, add `reserved 24;` so it cannot be reused -- confirm.
  string Prompt = 1;
  int32 Seed = 2;
  int32 Threads = 3;
  int32 Tokens = 4;
  int32 TopK = 5;
  int32 Repeat = 6;
  int32 Batch = 7;
  int32 NKeep = 8;
  float Temperature = 9;
  float Penalty = 10;
  bool F16KV = 11;
  bool DebugMode = 12;
  repeated string StopPrompts = 13;
  bool IgnoreEOS = 14;
  float TailFreeSamplingZ = 15;
  float TypicalP = 16;
  float FrequencyPenalty = 17;
  float PresencePenalty = 18;
  int32 Mirostat = 19;
  float MirostatETA = 20;
  float MirostatTAU = 21;
  bool PenalizeNL = 22;
  string LogitBias = 23;
  bool MLock = 25;
  bool MMap = 26;
  bool PromptCacheAll = 27;
  bool PromptCacheRO = 28;
  string Grammar = 29;
  string MainGPU = 30;
  string TensorSplit = 31;
  float TopP = 32;
  string PromptCachePath = 33;
  bool Debug = 34;
  repeated int32 EmbeddingTokens = 35;
  string Embeddings = 36;
  float RopeFreqBase = 37;
  float RopeFreqScale = 38;
  float NegativePromptScale = 39;
  string NegativePrompt = 40;
  int32 NDraft = 41;
  repeated string Images = 42;
  bool UseTokenizerTemplate = 43;
  repeated Message Messages = 44;
  repeated string Videos = 45;
  repeated string Audios = 46;
  string CorrelationId = 47;
}

// The response message containing the result
message Reply {
  bytes message = 1;
  int32 tokens = 2;
  int32 prompt_tokens = 3;
}

// Request of the LoadModel RPC. Field numbers are not in declaration order;
// they are part of the wire contract and must not be renumbered.
message ModelOptions {
  string Model = 1;
  int32 ContextSize = 2;
  int32 Seed = 3;
  int32 NBatch = 4;
  bool F16Memory = 5;
  bool MLock = 6;
  bool MMap = 7;
  bool VocabOnly = 8;
  bool LowVRAM = 9;
  bool Embeddings = 10;
  bool NUMA = 11;
  int32 NGPULayers = 12;
  string MainGPU = 13;
  string TensorSplit = 14;
  int32 Threads = 15;
  string LibrarySearchPath = 16;
  float RopeFreqBase = 17;
  float RopeFreqScale = 18;
  float RMSNormEps = 19;
  int32 NGQA = 20;
  string ModelFile = 21;

  // AutoGPTQ
  string Device = 22;
  bool UseTriton = 23;
  string ModelBaseName = 24;
  bool UseFastTokenizer = 25;

  // Diffusers
  string PipelineType = 26;
  string SchedulerType = 27;
  bool CUDA = 28;
  float CFGScale = 29;
  bool IMG2IMG = 30;
  string CLIPModel = 31;
  string CLIPSubfolder = 32;
  int32 CLIPSkip = 33;
  string ControlNet = 48;

  string Tokenizer = 34;

  // LLM (llama.cpp)
  string LoraBase = 35;
  string LoraAdapter = 36;
  float LoraScale = 42;
  bool NoMulMatQ = 37;
  string DraftModel = 39;

  string AudioPath = 38;

  // vllm
  string Quantization = 40;
  float GPUMemoryUtilization = 50;
  bool TrustRemoteCode = 51;
  bool EnforceEager = 52;
  int32 SwapSpace = 53;
  int32 MaxModelLen = 54;
  int32 TensorParallelSize = 55;
  string LoadFormat = 58;

  string MMProj = 41;

  string RopeScaling = 43;
  float YarnExtFactor = 44;
  float YarnAttnFactor = 45;
  float YarnBetaFast = 46;
  float YarnBetaSlow = 47;

  string Type = 49;

  bool FlashAttention = 56;
  bool NoKVOffload = 57;

  string ModelPath = 59;

  repeated string LoraAdapters = 60;
  repeated float LoraScales = 61;
}

// Generic outcome returned by LoadModel, GenerateImage, TTS, SoundGeneration,
// StoresSet and StoresDelete.
message Result {
  string message = 1;
  bool success = 2;
}

// Response of the Embedding RPC.
message EmbeddingResult {
  repeated float embeddings = 1;
}

// Request of the AudioTranscription RPC.
// NOTE(review): field number 1 is not used in this message. If it belonged to
// a removed field, add `reserved 1;` so it cannot be reused -- confirm.
message TranscriptRequest {
  string dst = 2;
  string language = 3;
  uint32 threads = 4;
  bool translate = 5;
}

// Response of the AudioTranscription RPC.
message TranscriptResult {
  repeated TranscriptSegment segments = 1;
  string text = 2;
}

// One entry of TranscriptResult.segments.
message TranscriptSegment {
  int32 id = 1;
  // NOTE(review): the unit of start/end is not stated in this file -- confirm
  // with the backend implementation before relying on it.
  int64 start = 2;
  int64 end = 3;
  string text = 4;
  repeated int32 tokens = 5;
}

// Request of the GenerateImage RPC.
message GenerateImageRequest {
  int32 height = 1;
  int32 width = 2;
  int32 mode = 3;
  int32 step = 4;
  int32 seed = 5;
  string positive_prompt = 6;
  string negative_prompt = 7;
  string dst = 8;
  string src = 9;

  // Diffusers
  string EnableParameters = 10;
  int32 CLIPSkip = 11;
}

// Request of the TTS RPC.
message TTSRequest {
  string text = 1;
  string model = 2;
  string dst = 3;
  string voice = 4;
  // Declared `optional`, so presence is tracked explicitly.
  optional string language = 5;
}

// Request of the SoundGeneration RPC. Fields 4-8 are declared `optional`, so
// an unset value is distinguishable from a zero value.
message SoundGenerationRequest {
  string text = 1;
  string model = 2;
  string dst = 3;
  optional float duration = 4;
  optional float temperature = 5;
  optional bool sample = 6;
  optional string src = 7;
  optional int32 src_divisor = 8;
}

// Response of the TokenizeString RPC.
message TokenizationResponse {
  int32 length = 1;
  repeated int32 tokens = 2;
}

// Memory figures carried in StatusResponse.
message MemoryUsageData {
  uint64 total = 1;
  // A map field must declare its key and value types; the bare `map` form is
  // rejected by protoc.
  // NOTE(review): <string, uint64> is reconstructed (value type matches
  // `total`) -- confirm against the generated code in use.
  map<string, uint64> breakdown = 2;
}

// Response of the Status RPC.
message StatusResponse {
  enum State {
    UNINITIALIZED = 0;
    BUSY = 1;
    READY = 2;
    // Negative enum values are legal but are encoded as a 10-byte varint.
    // The number is part of the wire contract and must not be changed.
    ERROR = -1;
  }

  State state = 1;
  MemoryUsageData memory = 2;
}

// A single role/content pair; used by PredictOptions.Messages.
message Message {
  string role = 1;
  string content = 2;
}