Alex Chen committed
Commit 5294086 · Parent: 83ee116

Feat: add gpustack model provider (#4469)

### What problem does this PR solve?

Add GPUStack as a new model provider.
[GPUStack](https://github.com/gpustack/gpustack) is an open-source GPU
cluster manager for running LLMs. Currently, models deployed locally in
GPUStack do not integrate well with RAGFlow. GPUStack provides both
OpenAI-compatible APIs (Models / Chat Completions / Embeddings /
Speech2Text / TTS) and additional APIs such as Rerank, so we would like
to support GPUStack as a model provider in RAGFlow.

[GPUStack Docs](https://docs.gpustack.ai/latest/quickstart/)

Related issue: https://github.com/infiniflow/ragflow/issues/4064.
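
For context, GPUStack exposes its OpenAI-compatible endpoints under a `v1-openai` path prefix, which is why the new classes below normalize `base_url`. A minimal sketch of calling such a deployment directly with the standard `openai` client; the server address, API key, and model name are placeholders, not values from this PR:

```python
# Sketch: query a GPUStack deployment through its OpenAI-compatible API.
# Host, port, key, and model name below are placeholder assumptions.
from openai import OpenAI

client = OpenAI(
    api_key="your-gpustack-api-key",           # generated in the GPUStack UI
    base_url="http://localhost:80/v1-openai",  # note the v1-openai prefix
)

resp = client.chat.completions.create(
    model="llama-3.2-1b-instruct",  # any LLM deployed in GPUStack
    messages=[{"role": "user", "content": "Hello from RAGFlow!"}],
)
print(resp.choices[0].message.content)
```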

### Type of change

- [x] New Feature (non-breaking change which adds functionality)



### Testing Instructions
1. Install GPUStack and deploy the `llama-3.2-1b-instruct` LLM, the `bge-m3`
text embedding model, the `bge-reranker-v2-m3` rerank model, the
`faster-whisper-medium` speech-to-text model, and the `cosyvoice-300m-sft`
text-to-speech model in GPUStack (a sanity-check sketch follows this list).
2. Add GPUStack as a model provider in the RAGFlow settings.
3. Test each model type in RAGFlow.
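
Before step 2, one can optionally sanity-check the endpoints RAGFlow will call. A rough sketch, assuming GPUStack listens on `http://localhost:80` and the key comes from its UI (both are placeholder assumptions):

```python
# Sketch: verify the GPUStack endpoints used by this PR before wiring
# them into RAGFlow. URL and key are placeholders for your deployment.
import requests

BASE = "http://localhost:80"
HEADERS = {"Authorization": "Bearer your-gpustack-api-key"}

# OpenAI-compatible model listing (chat, embedding, speech2text, tts).
print(requests.get(f"{BASE}/v1-openai/models", headers=HEADERS).json())

# Rerank endpoint, the same path GPUStackRerank builds below.
payload = {
    "model": "bge-reranker-v2-m3",
    "query": "What is GPUStack?",
    "documents": ["GPUStack is a GPU cluster manager.", "Unrelated text."],
}
print(requests.post(f"{BASE}/v1/rerank", headers=HEADERS, json=payload).json())
```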

api/apps/llm_app.py CHANGED
@@ -329,7 +329,7 @@ def my_llms():
 @manager.route('/list', methods=['GET']) # noqa: F821
 @login_required
 def list_app():
-    self_deploied = ["Youdao", "FastEmbed", "BAAI", "Ollama", "Xinference", "LocalAI", "LM-Studio"]
+    self_deployed = ["Youdao", "FastEmbed", "BAAI", "Ollama", "Xinference", "LocalAI", "LM-Studio", "GPUStack"]
     weighted = ["Youdao", "FastEmbed", "BAAI"] if settings.LIGHTEN != 0 else []
     model_type = request.args.get("model_type")
     try:
@@ -339,7 +339,7 @@ def list_app():
         llms = [m.to_dict()
                 for m in llms if m.status == StatusEnum.VALID.value and m.fid not in weighted]
         for m in llms:
-            m["available"] = m["fid"] in facts or m["llm_name"].lower() == "flag-embedding" or m["fid"] in self_deploied
+            m["available"] = m["fid"] in facts or m["llm_name"].lower() == "flag-embedding" or m["fid"] in self_deployed

         llm_set = set([m["llm_name"] + "@" + m["fid"] for m in llms])
         for o in objs:
conf/llm_factories.json CHANGED
@@ -2543,6 +2543,13 @@
             "tags": "TEXT EMBEDDING",
             "status": "1",
             "llm": []
+        },
+        {
+            "name": "GPUStack",
+            "logo": "",
+            "tags": "LLM,TEXT EMBEDDING,TTS,SPEECH2TEXT,TEXT RE-RANK",
+            "status": "1",
+            "llm": []
         }
     ]
 }
rag/llm/__init__.py CHANGED
@@ -42,6 +42,7 @@ from .embedding_model import (
     VoyageEmbed,
     HuggingFaceEmbed,
     VolcEngineEmbed,
+    GPUStackEmbed,
 )
 from .chat_model import (
     GptTurbo,
@@ -80,6 +81,7 @@ from .chat_model import (
     AnthropicChat,
     GoogleChat,
     HuggingFaceChat,
+    GPUStackChat,
 )

 from .cv_model import (
@@ -116,6 +118,7 @@ from .rerank_model import (
     BaiduYiyanRerank,
     VoyageRerank,
     QWenRerank,
+    GPUStackRerank,
 )
 from .sequence2txt_model import (
     GPTSeq2txt,
@@ -123,6 +126,7 @@ from .sequence2txt_model import (
     AzureSeq2txt,
     XinferenceSeq2txt,
     TencentCloudSeq2txt,
+    GPUStackSeq2txt,
 )
 from .tts_model import (
     FishAudioTTS,
@@ -130,6 +134,7 @@ from .tts_model import (
     OpenAITTS,
     SparkTTS,
     XinferenceTTS,
+    GPUStackTTS,
 )

 EmbeddingModel = {
@@ -161,6 +166,7 @@ EmbeddingModel = {
     "Voyage AI": VoyageEmbed,
     "HuggingFace": HuggingFaceEmbed,
     "VolcEngine": VolcEngineEmbed,
+    "GPUStack": GPUStackEmbed,
 }

 CvModel = {
@@ -220,6 +226,7 @@ ChatModel = {
     "Anthropic": AnthropicChat,
     "Google Cloud": GoogleChat,
     "HuggingFace": HuggingFaceChat,
+    "GPUStack": GPUStackChat,
 }

 RerankModel = {
@@ -237,6 +244,7 @@ RerankModel = {
     "BaiduYiyan": BaiduYiyanRerank,
     "Voyage AI": VoyageRerank,
     "Tongyi-Qianwen": QWenRerank,
+    "GPUStack": GPUStackRerank,
 }

 Seq2txtModel = {
@@ -245,6 +253,7 @@ Seq2txtModel = {
     "Azure-OpenAI": AzureSeq2txt,
     "Xinference": XinferenceSeq2txt,
     "Tencent Cloud": TencentCloudSeq2txt,
+    "GPUStack": GPUStackSeq2txt,
 }

 TTSModel = {
@@ -253,4 +262,5 @@ TTSModel = {
     "OpenAI": OpenAITTS,
     "XunFei Spark": SparkTTS,
     "Xinference": XinferenceTTS,
+    "GPUStack": GPUStackTTS,
 }
rag/llm/chat_model.py CHANGED
@@ -1514,3 +1514,11 @@ class GoogleChat(Base):
             yield ans + "\n**ERROR**: " + str(e)

         yield response._chunks[-1].usage_metadata.total_token_count
+
+class GPUStackChat(Base):
+    def __init__(self, key=None, model_name="", base_url=""):
+        if not base_url:
+            raise ValueError("Local llm url cannot be None")
+        if base_url.split("/")[-1] != "v1-openai":
+            base_url = os.path.join(base_url, "v1-openai")
+        super().__init__(key, model_name, base_url)
rag/llm/embedding_model.py CHANGED
@@ -30,7 +30,7 @@ import asyncio
 from api import settings
 from api.utils.file_utils import get_home_cache_dir
 from rag.utils import num_tokens_from_string, truncate
-import google.generativeai as genai
+import google.generativeai as genai
 import json


@@ -799,3 +799,14 @@ class VolcEngineEmbed(OpenAIEmbed):
         ark_api_key = json.loads(key).get('ark_api_key', '')
         model_name = json.loads(key).get('ep_id', '') + json.loads(key).get('endpoint_id', '')
         super().__init__(ark_api_key,model_name,base_url)
+
+class GPUStackEmbed(OpenAIEmbed):
+    def __init__(self, key, model_name, base_url):
+        if not base_url:
+            raise ValueError("url cannot be None")
+        if base_url.split("/")[-1] != "v1-openai":
+            base_url = os.path.join(base_url, "v1-openai")
+
+        print(key, base_url)
+        self.client = OpenAI(api_key=key, base_url=base_url)
+        self.model_name = model_name
rag/llm/rerank_model.py CHANGED
@@ -18,10 +18,12 @@ import threading
 from urllib.parse import urljoin

 import requests
+import httpx
 from huggingface_hub import snapshot_download
 import os
 from abc import ABC
 import numpy as np
+from yarl import URL

 from api import settings
 from api.utils.file_utils import get_home_cache_dir
@@ -457,3 +459,53 @@ class QWenRerank(Base):
             return rank, resp.usage.total_tokens
         else:
             raise ValueError(f"Error calling QWenRerank model {self.model_name}: {resp.status_code} - {resp.text}")
+
+class GPUStackRerank(Base):
+    def __init__(
+        self, key, model_name, base_url
+    ):
+        if not base_url:
+            raise ValueError("url cannot be None")
+
+        self.model_name = model_name
+        self.base_url = str(URL(base_url) / "v1" / "rerank")
+        self.headers = {
+            "accept": "application/json",
+            "content-type": "application/json",
+            "authorization": f"Bearer {key}",
+        }
+
+    def similarity(self, query: str, texts: list):
+        payload = {
+            "model": self.model_name,
+            "query": query,
+            "documents": texts,
+            "top_n": len(texts),
+        }
+
+        try:
+            response = requests.post(
+                self.base_url, json=payload, headers=self.headers
+            )
+            response.raise_for_status()
+            response_json = response.json()
+
+            rank = np.zeros(len(texts), dtype=float)
+            if "results" not in response_json:
+                return rank, 0
+
+            token_count = 0
+            for t in texts:
+                token_count += num_tokens_from_string(t)
+
+            for result in response_json["results"]:
+                rank[result["index"]] = result["relevance_score"]
+
+            return (
+                rank,
+                token_count,
+            )
+
+        except httpx.HTTPStatusError as e:
+            raise ValueError(f"Error calling GPUStackRerank model {self.model_name}: {e.response.status_code} - {e.response.text}")
+
rag/llm/sequence2txt_model.py CHANGED
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import os
 import requests
 from openai.lib.azure import AzureOpenAI
 import io
@@ -191,3 +192,14 @@ class TencentCloudSeq2txt(Base):
             return "**ERROR**: " + str(e), 0
         except Exception as e:
             return "**ERROR**: " + str(e), 0
+
+
+class GPUStackSeq2txt(Base):
+    def __init__(self, key, model_name, base_url):
+        if not base_url:
+            raise ValueError("url cannot be None")
+        if base_url.split("/")[-1] != "v1-openai":
+            base_url = os.path.join(base_url, "v1-openai")
+        self.base_url = base_url
+        self.model_name = model_name
+        self.key = key
rag/llm/tts_model.py CHANGED
@@ -355,3 +355,35 @@ class OllamaTTS(Base):
         for chunk in response.iter_content():
             if chunk:
                 yield chunk
+
+class GPUStackTTS:
+    def __init__(self, key, model_name, **kwargs):
+        self.base_url = kwargs.get("base_url", None)
+        self.api_key = key
+        self.model_name = model_name
+        self.headers = {
+            "accept": "application/json",
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}"
+        }
+
+    def tts(self, text, voice="Chinese Female", stream=True):
+        payload = {
+            "model": self.model_name,
+            "input": text,
+            "voice": voice
+        }
+
+        response = requests.post(
+            f"{self.base_url}/v1-openai/audio/speech",
+            headers=self.headers,
+            json=payload,
+            stream=stream
+        )
+
+        if response.status_code != 200:
+            raise Exception(f"**Error**: {response.status_code}, {response.text}")
+
+        for chunk in response.iter_content(chunk_size=1024):
+            if chunk:
+                yield chunk
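
As an illustration (not part of the diff), the streaming response of the class above could be consumed like this, with placeholder server details:

```python
# Sketch: exercise the GPUStackTTS class added above. The base_url, key,
# and output filename are placeholder assumptions for a local deployment.
tts = GPUStackTTS(
    key="your-gpustack-api-key",
    model_name="cosyvoice-300m-sft",
    base_url="http://localhost:80",
)

with open("hello.wav", "wb") as f:
    for chunk in tts.tts("Hello from RAGFlow!"):
        f.write(chunk)
```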
web/src/assets/svg/llm/gpustack.svg ADDED
web/src/constants/setting.ts CHANGED
@@ -72,6 +72,7 @@ export const IconMap = {
   'nomic-ai': 'nomic-ai',
   jinaai: 'jina',
   'sentence-transformers': 'sentence-transformers',
+  GPUStack: 'gpustack',
 };

 export const TimezoneList = [
web/src/pages/user-setting/constants.tsx CHANGED
@@ -31,6 +31,7 @@ export const LocalLlmFactories = [
   'Replicate',
   'OpenRouter',
   'HuggingFace',
+  'GPUStack',
 ];

 export enum TenantRole {
web/src/pages/user-setting/setting-model/ollama-modal/index.tsx CHANGED
@@ -29,6 +29,7 @@ const llmFactoryToUrlMap = {
   OpenRouter: 'https://openrouter.ai/docs',
   HuggingFace:
     'https://huggingface.co/docs/text-embeddings-inference/quick_tour',
+  GPUStack: 'https://docs.gpustack.ai/latest/quickstart',
 };
 type LlmFactory = keyof typeof llmFactoryToUrlMap;

@@ -76,6 +77,13 @@ const OllamaModal = ({
     { value: 'speech2text', label: 'sequence2text' },
     { value: 'tts', label: 'tts' },
   ],
+  GPUStack: [
+    { value: 'chat', label: 'chat' },
+    { value: 'embedding', label: 'embedding' },
+    { value: 'rerank', label: 'rerank' },
+    { value: 'speech2text', label: 'sequence2text' },
+    { value: 'tts', label: 'tts' },
+  ],
   Default: [
     { value: 'chat', label: 'chat' },
     { value: 'embedding', label: 'embedding' },