Commit · e021c17
1 Parent(s): 1bafebd
use llama-cpp-python instead of ollama

Files changed:
- Dockerfile +16 -64
- app.py +30 -40
- requirements.txt +1 -1
Dockerfile CHANGED
@@ -1,79 +1,31 @@
-…
-FROM ubuntu:22.04
-
-# Prevent interactive prompts during package installation
-ENV DEBIAN_FRONTEND=noninteractive
+FROM python:3.10-slim
 
 # Install system dependencies
-RUN apt-get update && \
-    …
-    wget \
+RUN apt-get update && \
+    apt-get install -y \
+    build-essential \
+    python3-dev \
     git \
-    …
+    wget \
     && rm -rf /var/lib/apt/lists/*
 
-# Install Ollama
-RUN curl -fsSL https://ollama.com/install.sh | sh
-
-# Set working directory
 WORKDIR /app
 
-# Copy requirements
+# Copy requirements first for better caching
 COPY requirements.txt .
-RUN …
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Create model directory
+RUN mkdir -p /app/models
+
+# Download the GGUF model (replace with your preferred Qwen GGUF model)
+RUN wget -P /app/models https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/blob/main/DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf
 
 # Copy application code
 COPY . .
 
-# …
-RUN echo '#!/bin/bash\n\
-\n\
-# Function to check if Ollama is responsive\n\
-check_ollama() {\n\
-    curl -s http://localhost:11434/api/version &>/dev/null\n\
-}\n\
-\n\
-# Start Ollama server\n\
-ollama serve & \n\
-\n\
-# Wait for Ollama to be responsive (up to 60 seconds)\n\
-count=0\n\
-while ! check_ollama && [ $count -lt 60 ]; do\n\
-    echo "Waiting for Ollama server to start..."\n\
-    sleep 1\n\
-    count=$((count + 1))\n\
-done\n\
-\n\
-if ! check_ollama; then\n\
-    echo "Failed to start Ollama server"\n\
-    exit 1\n\
-fi\n\
-\n\
-# Pull the model with retry logic\n\
-max_retries=3\n\
-retry_count=0\n\
-while [ $retry_count -lt $max_retries ]; do\n\
-    if ollama pull deepseek-r1:1.5b; then\n\
-        break\n\
-    fi\n\
-    echo "Failed to pull model, retrying..."\n\
-    retry_count=$((retry_count + 1))\n\
-    sleep 5\n\
-done\n\
-\n\
-if [ $retry_count -eq $max_retries ]; then\n\
-    echo "Failed to pull model after $max_retries attempts"\n\
-    exit 1\n\
-fi\n\
-\n\
-# Start the Gradio app\n\
-exec python3 -u app.py\n\
-' > start.sh && chmod +x start.sh
-
-# Expose port for Gradio web interface
+# Expose port for Gradio
 EXPOSE 7860
 
 # Run the application
-…
+CMD ["python3", "app.py"]
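One thing worth flagging in the new Dockerfile: the wget step pulls the model from a /blob/main/ URL, which on the Hugging Face Hub serves the file's web page rather than the raw GGUF; raw downloads normally go through /resolve/main/. Below is a minimal Python sanity-check sketch, assuming it runs inside the built image after the download step, that verifies the file at the path app.py expects really is a GGUF (valid GGUF files begin with the 4-byte magic b"GGUF"):

# Sanity-check sketch (assumption: run inside the built image, after the wget step).
# A real GGUF file starts with the 4-byte magic b"GGUF"; a file fetched from a
# /blob/ page URL is HTML and fails this check.
MODEL_PATH = "/app/models/DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"

with open(MODEL_PATH, "rb") as f:
    magic = f.read(4)

if magic != b"GGUF":
    raise RuntimeError(
        f"{MODEL_PATH} does not look like a GGUF file (leading bytes: {magic!r}); "
        "try downloading via /resolve/main/ instead of /blob/main/."
    )
print("GGUF magic present; model file looks usable.")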
app.py CHANGED
@@ -3,46 +3,33 @@ import jiwer
 import pandas as pd
 import logging
 from typing import List, Optional, Tuple, Dict
-from …
-import re
+from llama_cpp import Llama
 import os
-import time
-import requests
 
-# Set up logging
+# Set up logging
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(levelname)s - %(message)s',
     force=True,
-    handlers=[
-        logging.StreamHandler(),
-    ]
+    handlers=[logging.StreamHandler()]
 )
 logger = logging.getLogger(__name__)
 
-# Initialize …
-…
-    client = None
-    for i in range(max_retries):
-        try:
-            client = Client(host='http://localhost:11434')
-            # Test the connection
-            response = requests.get('http://localhost:11434/api/version')
-            if response.status_code == 200:
-                logger.info("Successfully connected to Ollama")
-                return client
-        except Exception as e:
-            logger.warning(f"Attempt {i+1}/{max_retries} to connect to Ollama failed: {str(e)}")
-            if i < max_retries - 1:
-                time.sleep(2)
-    raise Exception("Failed to initialize Ollama client")
+# Initialize LLM
+MODEL_PATH = "/app/models/DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"
 
-# Global client initialization
 try:
-    …
+    llm = Llama(
+        model_path=MODEL_PATH,
+        n_ctx=2048,      # Context window
+        n_threads=4,     # CPU threads
+        n_batch=512,     # Batch size
+        verbose=False    # Disable verbose output
+    )
+    logger.info("LLM initialized successfully")
 except Exception as e:
-    logger.error(f"Failed to initialize …
-    …
+    logger.error(f"Failed to initialize LLM: {str(e)}")
+    llm = None
 
 def calculate_wer_metrics(
     hypothesis: str,
@@ -124,29 +111,30 @@ def calculate_wer_metrics(
     return measures
 
 def extract_medical_terms(text: str) -> List[str]:
-    """Extract medical terms from text using Qwen model …
-    if …
-        logger.error("…
+    """Extract medical terms from text using Qwen model."""
+    if llm is None:
+        logger.error("LLM not initialized")
         return []
     prompt = f"""Extract all medical terms from the following text.
    Return only the medical terms as a comma-separated list.
    Text: {text}"""
 
     try:
-        response = …
-            …
+        response = llm(
+            prompt,
+            max_tokens=256,
+            temperature=0.1,
+            stop=["Text:", "\n\n"],
+            echo=False
         )
 
-        response_text = response['…
+        response_text = response['choices'][0]['text'].strip()
 
-        # Remove …
+        # Remove thinking process if present
        if '<think>' in response_text and '</think>' in response_text:
             medical_terms_text = response_text.split('</think>')[-1].strip()
         else:
             medical_terms_text = response_text
-
         medical_terms = [term.strip() for term in medical_terms_text.split(',')]
         return [term for term in medical_terms if term and not term.startswith('<') and not term.endswith('>')]
 
@@ -198,9 +186,12 @@ def process_inputs(
 
     try:
         # Extract medical terms
+        logger.info("Extracting medical terms from reference text...")
         reference_terms = extract_medical_terms(reference)
+        logger.info(f"Reference terms extracted: {reference_terms}")
+        logger.info("Extracting medical terms from hypothesis text...")
         hypothesis_terms = extract_medical_terms(hypothesis)
-
+        logger.info(f"Hypothesis terms extracted: {hypothesis_terms}")
         # Calculate medical recall
         med_recall = calculate_medical_recall(hypothesis_terms, reference_terms)
 
@@ -332,7 +323,6 @@ def create_interface() -> gr.Blocks:
         inputs=[reference, hypothesis, normalize, words_to_filter],
         outputs=[metrics_output, error_output, explanation_output, error_msg_output]
     )
-
     return interface
 
 if __name__ == "__main__":
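The diff cuts off at the if __name__ == "__main__": guard, so the launch call is not shown. For the EXPOSE 7860 in the Dockerfile to matter, the Gradio server has to bind to all interfaces on that port; a minimal sketch of what the tail of app.py would typically look like (create_interface comes from the diff above, the launch arguments are assumptions):

# Entry-point sketch (assumption: the actual code below the diff cut-off may differ).
if __name__ == "__main__":
    interface = create_interface()
    # Bind to 0.0.0.0 so the server is reachable from outside the container,
    # on the port exposed in the Dockerfile.
    interface.launch(server_name="0.0.0.0", server_port=7860)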
requirements.txt CHANGED
@@ -1,4 +1,4 @@
 gradio==5.16.0
 jiwer==3.1.0
 pandas==2.2.0
-…
+llama-cpp-python
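gradio, jiwer, and pandas stay pinned to exact versions while the new llama-cpp-python entry is unpinned, so the wheel (or source build) it resolves to can change between image builds. A small sketch, using only the standard library plus the new dependency, for checking what actually got installed:

# Sketch: confirm llama-cpp-python imports (its native extension built) and report
# the version that the unpinned requirement resolved to.
from importlib.metadata import version

import llama_cpp  # raises ImportError if the package or its native build is broken

print("llama-cpp-python", version("llama-cpp-python"))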