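// Express server that turns a text prompt into a streamed web page:
// the completion of a local llama.cpp model (loaded through llama-node)
// is written directly to the HTTP response as HTML.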
import express from "express"
import { LLM } from "llama-node"
import { LLamaCpp } from "llama-node/dist/llm/llama-cpp.js"
import { daisy } from "./daisy.mts"
import { alpine } from "./alpine.mts"

const llama = new LLM(LLamaCpp)
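
// load the quantized GGML model into llama.cpp; nGpuLayers is 0 below, so inference runs on CPU only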
await llama.load({
  // If you plan to use a different model you also need to edit line 26 in the Dockerfile
  modelPath: "./models/airoboros-13b-gpt4.ggmlv3.q4_0.bin",
  enableLogging: false,
  nCtx: 1024, // context window size, in tokens
  seed: 0,
  f16Kv: false,
  logitsAll: false,
  vocabOnly: false,
  useMlock: false,
  embedding: false,
  useMmap: true, // memory-map the model file instead of reading it all into RAM
  nGpuLayers: 0 // CPU-only inference (this Space has no GPU)
})

// define the CSS and JS dependencies
const css = [
  "/css/daisyui@2.6.0.css",
].map(item => `<link href="${item}" rel="stylesheet" type="text/css"/>`)
  .join("")

const script = [
  "/js/alpinejs@3.12.2.js",
  "/js/tailwindcss@3.3.2.js"
].map(item => `<script src="${item}"></script>`)
  .join("")

const app = express()
const port = 7860

const minPromptSize = 16 // if you change this, you will also need to change it in public/index.html
const timeoutInSec = 15 * 60

app.use(express.static("public"))

const maxParallelRequests = 1
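
// bookkeeping for in-flight requests: a monotonic counter, the queue of
// pending request ids, and one AbortController per request so we can
// cancel its generation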
const pending: {
  total: number;
  queue: string[];
  aborts: Record<string, AbortController>;
} = {
  total: 0,
  queue: [],
  aborts: {},
}
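
// ends a request however it terminated (normal completion, browser
// disconnect, or timeout); the queue check makes repeated calls harmless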
const endRequest = (id: string, reason: string) => {
  if (!id || !pending.queue.includes(id)) {
    return
  }

  // politely ask the LLM to stop
  try {
    pending.aborts[id].abort()
  } catch (err) {
    console.log(`could not abort request ${id} (${err})`)
  }

  // remove the request from everywhere
  try {
    pending.queue = pending.queue.filter(i => i !== id)
    delete pending.aborts[id]
    console.log(`cleaned up request ${id}`)
  } catch (err) {
    console.log(`failed to properly clean up request ${id} (${err})`)
  }

  console.log(`request ${id} ended (${reason})`)
}
app.get("/debug", (req, res) => { | |
res.write(JSON.stringify({ | |
nbTotal: pending.total, | |
nbPending: pending.queue.length, | |
queue: pending.queue, | |
})) | |
res.end() | |
}) | |
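
// sample /debug output (the values are purely illustrative):
// {"nbTotal":3,"nbPending":1,"queue":["2"]}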
app.get("/app", async (req, res) => { | |
if (`${req.query.prompt}`.length < minPromptSize) { | |
res.write(`prompt too short, please enter at least ${minPromptSize} characters`) | |
res.end() | |
return | |
} | |
// naive implementation: we say we are out of capacity | |
if (pending.queue.length >= maxParallelRequests) { | |
res.write('Sorry, max nb of parallel requests reached. A new slot should be available in < 15 min.') | |
res.end() | |
return | |
} | |
// alternative approach: kill old queries | |
// while (pending.queue.length > maxParallelRequests) { | |
// endRequest(pending.queue[0], 'max nb of parallel request reached') | |
// } | |
const id = `${pending.total++}` | |
console.log(`new request ${id}`) | |
pending.queue.push(id) | |
pending.aborts[id] = new AbortController() | |
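  // the controller's signal is handed to createCompletion() below,
  // so that endRequest() can cancel the generation mid-stream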

  // send the head of the page to the browser right away, so it can start loading the dependencies
  const prefix = `<html><head>${css}${script}`
  res.write(prefix)

  req.on("close", function() {
    endRequest(id, "browser ended the connection")
  })

  // for testing purposes we also kill the request after some delay
  setTimeout(() => {
    endRequest(id, `timed out after ${timeoutInSec}s`)
  }, timeoutInSec * 1000)

  const finalPrompt = `# Context
Generate this webapp: ${req.query.prompt}.
# Documentation
${daisy}
# Guidelines
- Never repeat the instruction, instead directly write the final code within a script tag
- Use a color scheme consistent with the brief and theme
- You need to use Tailwind CSS and DaisyUI for the UI. Do not use JS for simple pages (eg. blogs or articles).
- All the JS code will be written directly inside the page, using <script type="text/javascript">...</script>
- You MUST use English not Latin! I repeat: do NOT write lorem ipsum!
- No need to write code comments, and try to make the code compact (short function names etc)
- Use a central layout by wrapping everything in a \`<div class="flex flex-col items-center">\`
# HTML Code
${prefix}`
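
  // note: the prompt ends with the same HTML prefix we already sent to the
  // browser, so the model continues the document and its tokens can be
  // streamed straight into the page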
  const options = {
    prompt: finalPrompt,
    nThreads: 6, // try to make the most of our vCPUs
    nTokPredict: 1024, // maximum number of tokens to generate
    topK: 40,
    topP: 0.1,
    temp: 0.3, // low temp and topP keep the output focused and fairly deterministic
    repeatPenalty: 1,
  }

  try {
    await llama.createCompletion(options, (response) => {
      try {
        res.write(response.token)
      } catch (err) {
        console.log(`couldn't write the LLM response to the HTTP stream (${err})`)
      }
    }, pending.aborts[id].signal)
    endRequest(id, "normal end of the llama stream")
  } catch (e) {
    endRequest(id, `premature end of the llama stream (${e})`)
  }

  try {
    res.end()
  } catch (err) {
    console.log(`couldn't end the HTTP stream for request ${id} (${err})`)
  }
})

app.listen(port, () => { console.log(`Open http://localhost:${port}/?prompt=a%20webpage%20recipe%20for%20making%20chocolate%20chip%20cookies`) })
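
// example requests (the prompt values are just illustrations):
//   http://localhost:7860/app?prompt=a%20pomodoro%20timer  -> streams the generated HTML page
//   http://localhost:7860/debug                            -> shows the current queue state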