tarunkumark2 committed on
Commit
7794643
1 Parent(s): 8596318
Files changed (4) hide show
  1. Dockerfile +44 -0
  2. requirements-local.txt +4 -0
  3. run_model.py +64 -0
  4. templates/index.html +231 -0
Dockerfile ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Build BitNet (1-bit LLM inference) from source on Alpine and serve it
# through the Flask/Socket.IO wrapper in run_model.py on port 7860.
FROM python:3.9-alpine

# Toolchain needed to compile BitNet: C/C++ compilers, cmake, and git to fetch it.
RUN apk add --no-cache build-base cmake clang git && \
    rm -rf /var/cache/apk/*

# Shallow-clone the BitNet repository and drop .git to keep the image small.
RUN git clone --recursive --depth 1 https://github.com/microsoft/BitNet.git && \
    rm -rf BitNet/.git

WORKDIR /BitNet

# BitNet's own Python dependencies.
RUN pip install --no-cache-dir -r requirements.txt

# Extra dependencies for the web wrapper (Flask, Socket.IO, etc.).
COPY requirements-local.txt .

RUN pip install --no-cache-dir -r requirements-local.txt

# Generate the TL2 lookup-table kernels tuned for the Llama3-8B 1.58-bit model.
RUN python3 utils/codegen_tl2.py --model Llama3-8B-1.58-100B-tokens --BM 256,128,256,128 --BK 96,96,96,96 --bm 32,32,32,32

# Configure and build with clang; BITNET_X86_TL2 selects the x86 TL2 kernels.
RUN cmake -B build -DBITNET_X86_TL2=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++

RUN cmake --build build --config Release

# Fetch the quantized GGUF model weights from HuggingFace at build time.
ADD https://huggingface.co/brunopio/Llama3-8B-1.58-100B-tokens-GGUF/resolve/main/Llama3-8B-1.58-100B-tokens-TQ2_0.gguf .

# Fail the build if the downloaded weights do not match the expected SHA-256.
RUN echo "2565559c82a1d03ecd1101f536c5e99418d07e55a88bd5e391ed734f6b3989ac Llama3-8B-1.58-100B-tokens-TQ2_0.gguf" | sha256sum -c

# Port the Flask/Socket.IO server listens on (see run_model.py).
EXPOSE 7860

# Copy the wrapper app (run_model.py, templates/) into the image.
COPY . .

# COPY templates/* .
# Start the Socket.IO server that streams inference output to clients.
CMD ["python3", "run_model.py"]
requirements-local.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Extra runtime dependencies for the web wrapper (run_model.py), installed
# on top of BitNet's own requirements.txt. eventlet provides the async
# worker used by flask-socketio.
flask
flask-socketio
requests
eventlet
run_model.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, render_template
2
+ from flask_socketio import SocketIO, emit
3
+ import subprocess
4
+ import threading
5
+
6
app = Flask(__name__)
# Allow Socket.IO connections from any origin (the UI may be proxied/embedded).
socketio = SocketIO(app, cors_allowed_origins='*')

# Signals an in-flight streaming task that a newer query has superseded it.
stop_event = threading.Event()
# Handle of the currently running inference subprocess (None until first query),
# kept so a new query can terminate the previous process.
current_process = None
13
+
14
def stream_process_output(command):
    """Run *command* and stream its stdout to clients line by line.

    Each output line is emitted over Socket.IO as a ``'response'`` event
    with payload ``{'word': line}``. The loop honours the module-level
    ``stop_event`` so a newer query can cancel this stream, and stores
    the child in module-level ``current_process`` so the query handler
    can terminate it.

    Args:
        command: argv list for subprocess.Popen (shell=False, so no
            shell-injection risk from user-supplied text).
    """
    global current_process

    # Merge stderr into stdout so error output also reaches the UI;
    # bufsize=1 gives line buffering in text mode.
    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
    )
    current_process = process  # expose the handle for cancellation

    try:
        for stdout_line in process.stdout:
            if stop_event.is_set():
                # Superseded by a newer query: kill the producer so it
                # does not keep running after we abandon its output.
                process.terminate()
                break
            socketio.emit('response', {'word': stdout_line})
            socketio.sleep(0.1)  # yield so other tasks can run
    finally:
        # Always release the pipe and reap the child, even if emit raises
        # (the original leaked both on an exception mid-stream).
        process.stdout.close()
        process.wait()
30
+
31
@socketio.on('query')
def start_stream(data=None):
    """Handle a ``'query'`` event: launch inference and stream its output.

    Expects ``data`` to be a dict with a ``'query'`` string and an
    optional ``'args'`` string of whitespace-separated extra CLI flags.
    Any previous inference process/stream is cancelled before the new
    one starts.
    """
    global stop_event, current_process

    # Ignore malformed events instead of raising KeyError on data['query'].
    if not data or 'query' not in data:
        return

    query = data['query']

    command = ['python3', 'run_inference.py', '-m', 'Llama3-8B-1.58-100B-tokens-TQ2_0.gguf', '-p', query]

    # Optional extra CLI flags supplied by the client.
    if data.get('args'):
        command.extend(data['args'].strip().split())

    print(f"command- {command}")

    # Cancel any in-flight stream: raise the flag first so the old task can
    # see it, then terminate its child process.
    stop_event.set()
    if current_process and current_process.poll() is None:
        current_process.terminate()
    # Give the old task a chance to observe the flag before it is reset —
    # set() immediately followed by clear() was a race the old task
    # virtually never won.
    socketio.sleep(0.2)
    stop_event.clear()

    # Stream the new process in the background so this handler returns fast.
    socketio.start_background_task(target=stream_process_output, command=command)
58
+
59
@app.route('/')
def index():
    """Serve the single-page query UI."""
    page = render_template('index.html')
    return page
62
+
63
if __name__ == '__main__':
    # Bind on all interfaces; port 7860 matches the Dockerfile's EXPOSE.
    socketio.run(app, host='0.0.0.0', port=7860)
templates/index.html ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<!-- Single-page UI for the BitNet inference server (run_model.py).
     Collects a prompt plus optional CLI flags, sends them over Socket.IO,
     and streams the model's stdout into the response pane. -->
<html lang="en">

<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>BitNet 1-Bit LLM Query Interface</title>
    <style>
        /* Centered two-pane card layout: options on the left, output on the right. */
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            margin: 0;
            padding: 0;
            background-color: #f4f4f9;
            display: flex;
            justify-content: center;
            align-items: center;
            height: 100vh;
        }

        .container {
            display: flex;
            max-width: 1200px;
            width: 100%;
            background-color: #fff;
            border-radius: 8px;
            box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
            overflow: hidden;
        }

        /* Left pane: command options form. */
        .left-section {
            flex: 1;
            padding: 20px;
            background-color: white;
            color: black;
            display: flex;
            flex-direction: column;
            justify-content: space-between;
            box-sizing: border-box;
        }

        .left-section h2 {
            margin-bottom: 20px;
            font-size: 1.5rem;
        }

        .input-group {
            margin-bottom: 15px;
        }

        .input-group label {
            display: block;
            font-size: 0.9rem;
            margin-bottom: 8px;
        }

        .input-group input,
        .input-group textarea {
            width: 100%;
            padding: 10px;
            border-radius: 4px;
            font-size: 0.95rem;
            box-sizing: border-box;
        }

        input[type="number"],
        textarea {
            width: 100%;
            border: 1px solid #ccc;
            border-radius: 4px;
            box-sizing: border-box;
        }

        /* Teal focus ring matching the button accent color. */
        input[type="number"]:focus,
        textarea:focus {
            border-color: #238a95;
            outline: none;
            box-shadow: 0 0 5px rgba(35, 138, 149, 0.5);
        }

        input[type="number"]::placeholder,
        textarea::placeholder {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
        }

        .input-group textarea {
            resize: none;
        }

        .button-group {
            margin-top: 20px;
            margin: auto;
        }

        .button-group button {
            padding: 10px 20px;
            background-color: #238a95;
            border: none;
            color: white;
            cursor: pointer;
            border-radius: 4px;
            font-size: 1rem;
            transition: background-color 0.3s, transform 0.2s;
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
        }

        .button-group button:hover {
            background-color: #1e7a84;
            transform: scale(1.05);
        }

        /* Right pane: streamed model output. */
        .right-section {
            flex: 2;
            padding: 20px;
            background-color: #ffffff;
            border-left: 2px solid #dbdbdb;
            box-sizing: border-box;
        }

        .right-section h2 {
            margin-bottom: 20px;
        }

        #response {
            height: 430px;
            border: 1px solid #dbdbdb;
            border-radius: 4px;
            padding: 10px;
            overflow-y: auto;
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            background-color: #f5f4f459;
            font-size: 0.95rem;
        }

        /* Responsive design: stack panes vertically on narrow screens. */
        @media screen and (max-width: 768px) {
            .container {
                flex-direction: column;
            }

            .right-section {
                border-left: none;
                border-top: 2px solid #dbdbdb;
            }
        }
    </style>
</head>

<body>
    <div class="container">
        <!-- Left pane: prompt and optional inference flags (-n/-t/-c/-temp). -->
        <div class="left-section">
            <h2>Command Options</h2>
            <div class="input-group">
                <label for="tokens">Number of tokens to predict</label>
                <input type="number" id="tokens" min="0" placeholder="Enter number of tokens">
            </div>
            <div class="input-group">
                <label for="threads">Number of threads to use</label>
                <input type="number" id="threads" min="0" placeholder="Enter number of threads">
            </div>
            <div class="input-group">
                <label for="context-size">Size of the prompt context</label>
                <input type="number" id="context-size" min="0" placeholder="Enter context size">
            </div>
            <div class="input-group">
                <label for="temperature">Temperature, a hyperparameter that controls the randomness of the generated text</label>
                <input type="number" min="0" id="temperature" placeholder="Enter temperature value">
            </div>
            <div class="input-group">
                <label for="prompt">Prompt</label>
                <textarea id="prompt" rows="4" min="0" placeholder="Enter your prompt"></textarea>
            </div>
            <div class="button-group">
                <button onclick="sendQuery()">Send Query</button>
            </div>
        </div>
        <!-- Right pane: tokens streamed back from the server accumulate here. -->
        <div class="right-section">
            <h2>Response</h2>
            <div id="response"></div>
        </div>
    </div>

    <script type="module">
        import { io } from "https://cdn.socket.io/4.8.0/socket.io.esm.min.js";
        // Connects back to the page's origin (the Flask-SocketIO server).
        const socket = io();

        // Attached to window because the module scope hides it from the
        // inline onclick handler on the button above.
        window.sendQuery = function() {
            const tokens = document.getElementById('tokens').value;
            const threads = document.getElementById('threads').value;
            const contextSize = document.getElementById('context-size').value;
            const temperature = document.getElementById('temperature').value;
            const prompt = document.getElementById('prompt').value;

            if (!prompt) {
                return alert('There is no prompt to send!');
            }

            // Build the extra CLI-flag string; the server splits it on
            // whitespace and appends the parts to the inference command.
            let args = '';

            if (tokens && !isNaN(tokens) && tokens > -1) {
                args += ` -n ${tokens}`;
            }

            if (threads && !isNaN(threads) && threads > -1) {
                args += ` -t ${threads}`;
            }

            if (contextSize && !isNaN(contextSize) && contextSize > -1) {
                args += ` -c ${contextSize}`;
            }

            if (temperature && !isNaN(temperature) && temperature > -1) {
                args += ` -temp ${temperature}`;
            }

            // Clear previous response
            document.getElementById('response').innerText = '';

            // Emit query with all parameters
            socket.emit('query', { query: prompt, args });
        }

        // Server streams stdout line-by-line as 'response' events.
        socket.on('response', function (word) {
            const responseDiv = document.getElementById('response');
            responseDiv.innerText += word.word + ' ';
            // Scroll to the bottom of the response div
            responseDiv.scrollTop = responseDiv.scrollHeight;
        });
    </script>
</body>

</html>