AurelioAguirre committed
Commit c755479 · 1 Parent(s): 2c44633

Removed api prefix

Files changed (2):
  1. main/api.py +53 -53
  2. main/resources/config.yaml +1 -1
main/api.py CHANGED

@@ -80,62 +80,62 @@ class LLMApi:
             self.logger.error(f"Failed to download model {model_name}: {str(e)}")
             raise
 
-    def initialize_model(self, model_name: str) -> None:
-        """
-        Initialize a model and tokenizer for text generation.
-        Handles different platforms (CUDA, MPS, CPU) appropriately.
-        """
-        self.logger.info(f"Initializing generation model: {model_name}")
-        try:
-            self.generation_model_name = model_name
-            local_model_path = self.models_path / model_name.split('/')[-1]
-
-            # Check if model exists locally
-            if local_model_path.exists():
-                self.logger.info(f"Loading model from local path: {local_model_path}")
-                model_path = local_model_path
-            else:
-                self.logger.info(f"Loading model from source: {model_name}")
-                model_path = model_name
-
-            # Check platform and set appropriate configuration
-            if torch.cuda.is_available():
-                self.logger.info("CUDA detected, using GPU with quantization")
-                quantization_config = BitsAndBytesConfig(
-                    load_in_8bit=True,
-                    llm_int8_threshold=3.0
-                )
-                self.generation_model = AutoModelForCausalLM.from_pretrained(
-                    model_path,
-                    device_map="auto",
-                    quantization_config=quantization_config,
-                    torch_dtype=torch.float16
-                )
-            elif torch.backends.mps.is_available():
-                self.logger.info("Apple Silicon detected, using MPS device")
-                self.generation_model = AutoModelForCausalLM.from_pretrained(
-                    model_path,
-                    device_map="mps",
-                    torch_dtype=torch.float16
-                )
-            else:
-                self.logger.info("No GPU detected, falling back to CPU")
-                self.generation_model = AutoModelForCausalLM.from_pretrained(
-                    model_path,
-                    device_map="cpu",
-                    torch_dtype=torch.float32  # Use full precision for CPU
-                )
+    def initialize_model(self, model_name: str) -> None:
+        """
+        Initialize a model and tokenizer for text generation.
+        Handles different platforms (CUDA, MPS, CPU) appropriately.
+        """
+        self.logger.info(f"Initializing generation model: {model_name}")
+        try:
+            self.generation_model_name = model_name
+            local_model_path = self.models_path / model_name.split('/')[-1]
+
+            # Check if model exists locally
+            if local_model_path.exists():
+                self.logger.info(f"Loading model from local path: {local_model_path}")
+                model_path = local_model_path
+            else:
+                self.logger.info(f"Loading model from source: {model_name}")
+                model_path = model_name
+
+            # Check platform and set appropriate configuration
+            if torch.cuda.is_available():
+                self.logger.info("CUDA detected, using GPU with quantization")
+                quantization_config = BitsAndBytesConfig(
+                    load_in_8bit=True,
+                    llm_int8_threshold=3.0
+                )
+                self.generation_model = AutoModelForCausalLM.from_pretrained(
+                    model_path,
+                    device_map="auto",
+                    quantization_config=quantization_config,
+                    torch_dtype=torch.float16
+                )
+            elif torch.backends.mps.is_available():
+                self.logger.info("Apple Silicon detected, using MPS device")
+                self.generation_model = AutoModelForCausalLM.from_pretrained(
+                    model_path,
+                    device_map="mps",
+                    torch_dtype=torch.float16
+                )
+            else:
+                self.logger.info("No GPU detected, falling back to CPU")
+                self.generation_model = AutoModelForCausalLM.from_pretrained(
+                    model_path,
+                    device_map="cpu",
+                    torch_dtype=torch.float32  # Use full precision for CPU
+                )
 
-            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
 
-            # Update generation config with tokenizer-specific values
-            self.generation_config["eos_token_id"] = self.tokenizer.eos_token_id
-            self.generation_config["pad_token_id"] = self.tokenizer.eos_token_id
-
-            self.logger.info(f"Successfully initialized generation model: {model_name}")
-        except Exception as e:
-            self.logger.error(f"Failed to initialize generation model {model_name}: {str(e)}")
-            raise
+            # Update generation config with tokenizer-specific values
+            self.generation_config["eos_token_id"] = self.tokenizer.eos_token_id
+            self.generation_config["pad_token_id"] = self.tokenizer.eos_token_id
+
+            self.logger.info(f"Successfully initialized generation model: {model_name}")
+        except Exception as e:
+            self.logger.error(f"Failed to initialize generation model {model_name}: {str(e)}")
+            raise
 
     def initialize_embedding_model(self, model_name: str) -> None:
        """
main/resources/config.yaml CHANGED

@@ -25,7 +25,7 @@ logging:
 
 api:
   version: "v1"
-  prefix: "/api"
+  prefix: ""
   cors:
     origins: ["*"]
     credentials: true
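This one-line change is what the commit title describes: with `prefix: ""`, routes formerly mounted under `/api` move to the application root. The server code that consumes these values is not part of this diff; a hypothetical sketch of the usual wiring (FastAPI is an assumption here, since the framework is not shown):

from fastapi import APIRouter, FastAPI

# Values from config.yaml after this commit; before it, prefix was "/api".
config = {"prefix": "", "version": "v1"}

app = FastAPI()
# Hypothetical wiring -- prefix and version combine into the mount path.
router = APIRouter(prefix=f"{config['prefix']}/{config['version']}")

@router.get("/health")
def health() -> dict:
    return {"status": "ok"}

app.include_router(router)

Under the old config this health route would resolve to `/api/v1/health`; after this commit it resolves to `/v1/health`.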