AurelioAguirre committed on
Commit d828ce4
1 Parent(s): b3cf4b4

First commit

Files changed (6)
  1. dockerfile +43 -0
  2. main/__init__.py +0 -0
  3. main/api.py +0 -0
  4. main/main.py +179 -0
  5. requirements.txt +7 -0
  6. setup_project.py +48 -0
dockerfile ADDED
@@ -0,0 +1,43 @@
+ # Use NVIDIA CUDA base image
+ FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 as base
+
+ # Set working directory to /code (Hugging Face Spaces convention)
+ WORKDIR /code
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     python3.10 \
+     python3-pip \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Install Python packages
+ COPY requirements.txt .
+ RUN pip3 install --no-cache-dir -r requirements.txt
+
+ # Install any additional dependencies needed for litgpt
+ RUN pip3 install --no-cache-dir \
+     einops \
+     xformers \
+     bitsandbytes \
+     accelerate \
+     sentencepiece
+
+ # Copy the application code
+ COPY . .
+
+ # Create model directory structure
+ RUN mkdir -p /code/checkout/meta \
+     /code/checkout/microsoft \
+     /code/checkout/mistralai
+
+ # Set environment variables
+ ENV PYTHONPATH=/code
+ ENV LLM_ENGINE_HOST=0.0.0.0
+ ENV LLM_ENGINE_PORT=8001
+
+ # Expose the port the app runs on
+ EXPOSE 8001
+
+ # Command to run the application
+ # main.py lives in the main/ package copied into /code
+ CMD ["python3", "main/main.py"]
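
For reference, a minimal sketch of one way to build and launch this image locally, driven from Python since the repository contains no shell scripts. The image tag, the use of the NVIDIA container runtime (`--gpus all`), and the helper name `build_and_run` are assumptions, not part of this commit.

# Hypothetical build-and-run helper (not part of this commit).
# Assumes the Docker CLI and the NVIDIA container runtime are installed;
# the image tag "llm-engine" is an arbitrary choice.
import subprocess

def build_and_run(tag: str = "llm-engine") -> None:
    # Build the image from the lowercase "dockerfile" at the project root
    subprocess.run(["docker", "build", "-f", "dockerfile", "-t", tag, "."], check=True)
    # Run it with GPU access, publishing the port exposed above (8001)
    subprocess.run(
        ["docker", "run", "--rm", "--gpus", "all", "-p", "8001:8001", tag],
        check=True,
    )

if __name__ == "__main__":
    build_and_run()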
main/__init__.py ADDED
File without changes
main/api.py ADDED
File without changes
main/main.py ADDED
@@ -0,0 +1,179 @@
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from typing import Optional, Dict, Any, Union
+ import torch
+ import logging
+ from pathlib import Path
+ from litgpt.api import LLM
+ import os
+ import uvicorn
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ app = FastAPI(title="LLM Engine Service")
+
+ # Global variable to store the LLM instance
+ llm_instance = None
+
+ class InitializeRequest(BaseModel):
+     """
+     Configuration for model initialization including model path
+     """
+     mode: str = "cpu"
+     precision: Optional[str] = None
+     quantize: Optional[str] = None
+     gpu_count: Union[str, int] = "auto"
+     model_path: str
+
+ class GenerateRequest(BaseModel):
+     prompt: str
+     max_new_tokens: int = 50
+     temperature: float = 1.0
+     top_k: Optional[int] = None
+     top_p: float = 1.0
+     return_as_token_ids: bool = False
+     stream: bool = False
+
+ @app.post("/initialize")
+ async def initialize_model(request: InitializeRequest):
+     """
+     Initialize the LLM model with specified configuration.
+     """
+     global llm_instance
+
+     try:
+         if request.precision is None and request.quantize is None:
+             # Use auto distribution from load when no specific precision or quantization is set
+             llm_instance = LLM.load(
+                 model=request.model_path,
+                 distribute="auto"  # Let the load function handle distribution automatically
+             )
+
+             logger.info(
+                 f"Model initialized with auto settings:\n"
+                 f"Model Path: {request.model_path}\n"
+                 f"Current GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB allocated, "
+                 f"{torch.cuda.memory_reserved()/1024**3:.2f}GB reserved"
+             )
+         else:
+             # Original initialization path for when specific settings are requested
+             llm_instance = LLM.load(
+                 model=request.model_path,
+                 distribute=None  # We'll distribute manually
+             )
+
+             # Distribute the model according to the configuration
+             llm_instance.distribute(
+                 accelerator="cuda" if request.mode == "gpu" else "cpu",
+                 devices=request.gpu_count,
+                 precision=request.precision,
+                 quantize=request.quantize
+             )
+
+             logger.info(
+                 f"Model initialized successfully with config:\n"
+                 f"Mode: {request.mode}\n"
+                 f"Precision: {request.precision}\n"
+                 f"Quantize: {request.quantize}\n"
+                 f"GPU Count: {request.gpu_count}\n"
+                 f"Model Path: {request.model_path}\n"
+                 f"Current GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB allocated, "
+                 f"{torch.cuda.memory_reserved()/1024**3:.2f}GB reserved"
+             )
+
+         return {"success": True, "message": "Model initialized successfully"}
+
+     except Exception as e:
+         logger.error(f"Error initializing model: {str(e)}")
+         # Print detailed memory statistics on failure
+         logger.error(f"GPU Memory Stats:\n"
+                      f"Allocated: {torch.cuda.memory_allocated()/1024**3:.2f}GB\n"
+                      f"Reserved: {torch.cuda.memory_reserved()/1024**3:.2f}GB\n"
+                      f"Max Allocated: {torch.cuda.max_memory_allocated()/1024**3:.2f}GB")
+         raise HTTPException(status_code=500, detail=f"Error initializing model: {str(e)}")
+
+ @app.post("/generate")
+ async def generate(request: GenerateRequest):
+     """
+     Generate text using the initialized model.
+     """
+     global llm_instance
+
+     if llm_instance is None:
+         raise HTTPException(status_code=400, detail="Model not initialized. Call /initialize first.")
+
+     try:
+         if request.stream:
+             # For streaming responses, we need to handle differently
+             # This is a placeholder as the actual streaming implementation
+             # would need to use StreamingResponse from FastAPI
+             raise HTTPException(
+                 status_code=400,
+                 detail="Streaming is not currently supported through the API"
+             )
+
+         generated_text = llm_instance.generate(
+             prompt=request.prompt,
+             max_new_tokens=request.max_new_tokens,
+             temperature=request.temperature,
+             top_k=request.top_k,
+             top_p=request.top_p,
+             return_as_token_ids=request.return_as_token_ids,
+             stream=False  # Force stream to False for now
+         )
+
+         response = {
+             "generated_text": generated_text if not request.return_as_token_ids else generated_text.tolist(),
+             "metadata": {
+                 "prompt": request.prompt,
+                 "max_new_tokens": request.max_new_tokens,
+                 "temperature": request.temperature,
+                 "top_k": request.top_k,
+                 "top_p": request.top_p
+             }
+         }
+
+         return response
+
+     except Exception as e:
+         logger.error(f"Error generating text: {str(e)}")
+         raise HTTPException(status_code=500, detail=f"Error generating text: {str(e)}")
+
+ @app.get("/health")
+ async def health_check():
+     """
+     Check if the service is running and model is loaded.
+     """
+     global llm_instance
+
+     status = {
+         "status": "healthy",
+         "model_loaded": llm_instance is not None,
+     }
+
+     if llm_instance is not None:
+         status["model_info"] = {
+             "model_path": llm_instance.config.name,
+             "device": str(next(llm_instance.model.parameters()).device)
+         }
+
+     return status
+
+ def main():
+     # Load environment variables or configuration here
+     host = os.getenv("LLM_ENGINE_HOST", "0.0.0.0")
+     port = int(os.getenv("LLM_ENGINE_PORT", "8001"))
+
+     # Start the server
+     uvicorn.run(
+         app,
+         host=host,
+         port=port,
+         log_level="info",
+         reload=False
+     )
+
+ if __name__ == "__main__":
+     main()
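
For context, a minimal client sketch showing how the endpoints above might be exercised once the service is running. The base URL, the `requests` dependency, and the example checkpoint path under `checkout/` are assumptions rather than part of this commit.

# Hypothetical client for the endpoints defined above (not part of this commit).
# Assumes the service is reachable on localhost:8001 and that a litgpt
# checkpoint exists at the example path below.
import requests

BASE_URL = "http://localhost:8001"

# Load a model on GPU via the auto-distribution path (no precision/quantize set)
init_resp = requests.post(
    f"{BASE_URL}/initialize",
    json={"mode": "gpu", "model_path": "checkout/microsoft/phi-2"},  # example path
    timeout=600,
)
init_resp.raise_for_status()

# Confirm the model is loaded
print(requests.get(f"{BASE_URL}/health", timeout=10).json())

# Generate a short completion
gen_resp = requests.post(
    f"{BASE_URL}/generate",
    json={"prompt": "Hello, world!", "max_new_tokens": 50, "temperature": 0.8},
    timeout=600,
)
print(gen_resp.json()["generated_text"])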
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ fastapi==0.109.0
+ uvicorn==0.27.0
+ pydantic==2.5.3
+ torch==2.5.0
+ transformers==4.36.2
+ litgpt[all]
+ python-dotenv==1.0.0
setup_project.py ADDED
@@ -0,0 +1,48 @@
+ import os
+ import subprocess
+ import sys
+ import venv
+ from pathlib import Path
+
+ def setup_project():
+     # Ensure we're in the right directory
+     project_dir = Path(__file__).parent.absolute()
+     os.chdir(project_dir)
+
+     print("Setting up the project...")
+
+     # Create virtual environment if it doesn't exist
+     venv_dir = project_dir / "myenv"
+     if not venv_dir.exists():
+         print("Creating virtual environment...")
+         venv.create(venv_dir, with_pip=True)
+
+     # Determine the path to the Python executable in the virtual environment
+     if sys.platform == "win32":
+         python_executable = venv_dir / "Scripts" / "python.exe"
+         pip_executable = venv_dir / "Scripts" / "pip.exe"
+     else:
+         python_executable = venv_dir / "bin" / "python"
+         pip_executable = venv_dir / "bin" / "pip"
+
+     # Upgrade pip
+     print("Upgrading pip...")
+     subprocess.run([str(python_executable), "-m", "pip", "install", "--upgrade", "pip"])
+
+     # Install requirements
+     print("Installing requirements...")
+     requirements_file = project_dir / "requirements.txt"
+     if requirements_file.exists():
+         subprocess.run([str(pip_executable), "install", "-r", "requirements.txt"])
+     else:
+         print("Warning: requirements.txt not found!")
+
+     print("\nSetup completed successfully!")
+     print("\nTo activate the virtual environment:")
+     if sys.platform == "win32":
+         print(f"  {venv_dir}\\Scripts\\activate")
+     else:
+         print(f"  source {venv_dir}/bin/activate")
+
+ if __name__ == "__main__":
+     setup_project()