<script lang="ts">
    import Textarea from "@/lib/components/ui/textarea/textarea.svelte";
    import Badge from "@/lib/components/ui/badge/badge.svelte";
    import * as webllm from "@mlc-ai/web-llm";
    import { onMount } from 'svelte';

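    // Default to the quantized 360M SmolLM build; q4f16_1 weights keep the browser download small.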
    let selectedModel = "SmolLM-360M-Instruct-q4f16_1-MLC";

    let engine: webllm.MLCEngineInterface;
    let isLoading = false;
    let loadingStatus = '';
    let inputText = '';
    let outputText = '';
    let error = '';
    let completionSpeed: number | null = null;
    let tokensPerSecond: number | null = null;
    let isGenerating = false;
    let pendingRequest: string | null = null;

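    // Download and initialize the selected model in the browser, surfacing progress via loadingStatus.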
    async function loadWebLLM() {
        isLoading = true;
        error = '';
        const initProgressCallback = (report: webllm.InitProgressReport) => {
            loadingStatus = report.text;
        };

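        // Register the selectable models: prebuilt weights on the Hugging Face Hub plus matching WebGPU model libraries.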
        const appConfig: webllm.AppConfig = {
            model_list: [{
                model: `https://huggingface.co/mlc-ai/SmolLM-360M-Instruct-q4f16_1-MLC`,
                model_id: 'SmolLM-360M-Instruct-q4f16_1-MLC',
                model_lib: `${webllm.modelLibURLPrefix}${webllm.modelVersion}/SmolLM-360M-Instruct-q4f16_1-ctx2k_cs1k-webgpu.wasm`,
                overrides: { context_window_size: 2048 },
            },
            {
                model: `https://huggingface.co/mlc-ai/Qwen2-0.5B-Instruct-q4f16_1-MLC`,
                model_id: 'Qwen2-0.5B-Instruct-q4f16_1-MLC',
                model_lib: `${webllm.modelLibURLPrefix}${webllm.modelVersion}/Qwen2-0.5B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm`,
                overrides: { context_window_size: 2048 },
            }
        ],
        };

        try {
            engine = await webllm.CreateMLCEngine(selectedModel, {
                appConfig,
                initProgressCallback,
                logLevel: "INFO",
            });
        } catch (err) {
            error = `Failed to load the model: ${(err as Error).message}`;
        } finally {
            isLoading = false;
        }
    }

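    // Generate a short completion for the textarea content. Concurrent calls are
    // serialized: while one request runs, only the latest input is queued.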
    async function generateCompletion(content: string) {
        if (!engine || isGenerating) {
            /**
             * The engine is still loading or a completion is in flight:
             * remember only the latest input so it can be replayed once
             * the current request finishes.
             */
            pendingRequest = content.trim();
            return;
        }

        if (!content.trim()) return;

        isGenerating = true;
        const startTime = performance.now();
        try {
            console.log("Generating completion:", content);
            const response = await engine.chat.completions.create({
                messages: [
                    {role:"system", content: "You are a helpful AI agent helping users. Try your best to answer the users request."},
                    {role: "user", content: content}
                ],
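                // A small token cap keeps each completion fast for the type-as-you-go demo.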
                max_tokens: 10,
            });

            outputText = response.choices[0].message.content || "";

            // Append an ellipsis to show the response was cut short by the token limit.
            if (response.choices[0].finish_reason === "length") {
                outputText += "...";
            }

            const endTime = performance.now();
            const elapsedTimeInSeconds = (endTime - startTime) / 1000;
            // Wall-clock latency for the whole request, in milliseconds.
            completionSpeed = Math.round(endTime - startTime);

            // Throughput, based on the token count the engine reports generating.
            const generatedTokens = response.usage?.completion_tokens || 0;
            tokensPerSecond = Math.round(generatedTokens / elapsedTimeInSeconds);
            
            error = '';
        } catch (err) {
            error = `Error: ${(err as Error).message}`;
        } finally {
            isGenerating = false;
            
            // Replay the most recent input queued while this request was running.
            if (pendingRequest && pendingRequest !== content) {
                const nextRequest = pendingRequest;
                pendingRequest = null;
                await generateCompletion(nextRequest);
            }
        }
    }

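    // Start loading the model as soon as the component mounts.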
    onMount(loadWebLLM);
</script>

<div class="flex my-20 flex-col items-center gap-4 max-w-lg mx-auto">
    <h1 class="text-center font-mono font-bold text-4xl">SmolLM 🤗</h1>
    <p class="text-center font-mono text-sm mb-4">Powered by {selectedModel}</p>
    <Textarea 
        bind:value={inputText} 
        on:input={() => generateCompletion(inputText)} 
        disabled={isLoading}
        class="w-full" 
        placeholder="Say something..."
    />
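    <!-- Status row: loading progress, an error message, or performance badges -->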
    {#if isLoading}
        <p class="text-sm text-slate-600 text-center">{loadingStatus}</p>
    {:else if error}
        <p class="text-sm text-red-600">{error}</p>
    {:else}
        <div class="flex gap-2">
            {#if completionSpeed !== null}
                <Badge>{completionSpeed}ms</Badge>
            {/if}
            {#if tokensPerSecond !== null}
                <Badge>{tokensPerSecond} tok/s</Badge>
            {/if}
            <Badge>{selectedModel}</Badge>
        </div>
    {/if}
    <pre class="text-lg font-bold whitespace-pre-wrap">{outputText}</pre>

</div>