Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 4,753 Bytes
7fefeaf d9b3577 7fefeaf d9b3577 7fefeaf 9a56e2f 7fefeaf 0cf119e 7fefeaf d40d743 0cf119e 7fefeaf d9b3577 7fefeaf d40d743 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
import express from "express"
import { LLM } from "llama-node"
import { LLamaCpp } from "llama-node/dist/llm/llama-cpp.js"
import { daisy } from "./daisy.mts"
import { alpine } from "./alpine.mts"
const llama = new LLM(LLamaCpp)
await llama.load({
// If you plan to use a different model you also need to edit line 26 in the Dockerfile
modelPath: "./models/airoboros-13b-gpt4.ggmlv3.q4_0.bin",
enableLogging: false,
nCtx: 1024,
seed: 0,
f16Kv: false,
logitsAll: false,
vocabOnly: false,
useMlock: false,
embedding: false,
useMmap: true,
nGpuLayers: 0
})
// define the CSS and JS dependencies
const css = [
"/css/daisyui@2.6.0.css",
].map(item => `<link href="${item}" rel="stylesheet" type="text/css"/>`)
.join("")
const script = [
"/js/alpinejs@3.12.2.js",
"/js/tailwindcss@3.3.2.js"
].map(item => `<script src="${item}"></script>`)
.join("")
const app = express()
const port = 7860
const minPromptSize = 16 // if you change this, you will need to also change in public/index.html
const timeoutInSec = 15 * 60
app.use(express.static("public"))
const maxParallelRequests = 1
const pending: {
total: number;
queue: string[];
aborts: Record<string, any>,
} = {
total: 0,
queue: [],
aborts: {},
}
const endRequest = (id: string, reason: string) => {
if (!id || !pending.queue.includes(id)) {
return
}
// politely ask the LLM to stop
try {
pending.aborts[id].abort()
} catch (err) {
console.log(`could not abort request ${id} (${err})`)
}
// remove the request from everywhere
try {
pending.queue = pending.queue.filter(i => i !== id)
delete pending.aborts[id]
console.log(`cleaned up request ${id}`)
} catch (err) {
console.log(`failed to properly clean up request ${id}`)
}
console.log(`request ${id} ended (${reason})`)
}
app.get("/debug", (req, res) => {
res.write(JSON.stringify({
nbTotal: pending.total,
nbPending: pending.queue.length,
queue: pending.queue,
}))
res.end()
})
app.get("/app", async (req, res) => {
if (`${req.query.prompt}`.length < minPromptSize) {
res.write(`prompt too short, please enter at least ${minPromptSize} characters`)
res.end()
return
}
// naive implementation: we say we are out of capacity
if (pending.queue.length >= maxParallelRequests) {
res.write('Sorry, max nb of parallel requests reached. A new slot should be available in < 15 min.')
res.end()
return
}
// alternative approach: kill old queries
// while (pending.queue.length > maxParallelRequests) {
// endRequest(pending.queue[0], 'max nb of parallel request reached')
// }
const id = `${pending.total++}`
console.log(`new request ${id}`)
pending.queue.push(id)
pending.aborts[id] = new AbortController()
const prefix = `<html><head>${css}${script}`
res.write(prefix)
req.on("close", function() {
endRequest(id, "browser ended the connection")
})
// for testing we kill after some delay
setTimeout(() => {
endRequest(id, `timed out after ${timeoutInSec}s`)
}, timeoutInSec * 1000)
const finalPrompt = `# Context
Generate this webapp: ${req.query.prompt}.
# Documentation
${daisy}
# Guidelines
- Never repeat the instruction, instead directly write the final code within a script tag
- Use a color scheme consistent with the brief and theme
- You need to use Tailwind CSS and DaisyUI for the UI. Do not use JS for simple pages (eg. blogs or articles).
- All the JS code will be written directly inside the page, using <script type="text/javascript">...</script>
- You MUST use English not Latin! I repeat: do NOT write lorem ipsum!
- No need to write code comments, and try to make the code compact (short function names etc)
- Use a central layout by wrapping everything in a \`<div class="flex flex-col items-center">\`
# HTML Code
${prefix}`
const options = {
prompt: finalPrompt,
nThreads: 6, // try to use the most of our vCPUs
nTokPredict: 1024,
topK: 40,
topP: 0.1,
temp: 0.3,
repeatPenalty: 1,
}
try {
await llama.createCompletion(options, (response) => {
try {
res.write(response.token)
} catch (err) {
console.log(`coudln't write the LLM response to the HTTP stream ${err}`)
}
}, pending.aborts[id].signal)
endRequest(id, `normal end of the llama stream for request ${id}`)
} catch (e) {
endRequest(id, `premature end of the llama stream for request ${id} (${e})`)
}
try {
res.end()
} catch (err) {
console.log(`couldn't end the HTTP stream for request ${id} (${err})`)
}
})
app.listen(port, () => { console.log(`Open http://localhost:${port}/?prompt=a%20webpage%20recipe%20for%20making%20chocolate%20chip%20cookies`) })
|