Spaces:

Anja97
/

prompt-search-engine

Runtime error

App Files Files Community

Anja97 committed on Nov 6, 2024

Commit

adad4ac

1 Parent(s): 3da5ff8

Initial commit with cleaned project files

Browse files

Files changed (7) hide show

Dockerfile +43 -0
README.md +54 -0
api.py +49 -0
prompt_search_engine.py +16 -0
requirements.txt +5 -0
similarity.py +6 -0
vectorizer.py +10 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,43 @@

+# Use an official Python runtime as a parent image
+FROM python:3.9-slim
+# Install git and git-lfs in a single layer.
+# --no-install-recommends and removing the apt package lists keep the image small.
+# `git lfs install --system` writes the LFS filters to the system-wide git config,
+# so they also apply to the non-root user the container runs as (a plain
+# `git lfs install` executed as root only configures root's own git config).
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends git git-lfs \
+    && git lfs install --system \
+    && rm -rf /var/lib/apt/lists/*
+# Create a non-root user 'appuser'
+RUN useradd -ms /bin/bash appuser
+# Set the working directory
+WORKDIR /home/appuser/app
+# Copy requirements file first so the dependency layer is cached across code changes
+COPY requirements.txt .
+# Install required packages
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy application code
+COPY . .
+# Set environment variables for cache directories
+ENV HF_HOME=/home/appuser/app/.cache
+ENV HF_DATASETS_CACHE=/home/appuser/app/.cache
+# Create the cache directory
+RUN mkdir -p /home/appuser/app/.cache
+# Change ownership of the application files (including the cache directory)
+RUN chown -R appuser:appuser /home/appuser/app
+# Switch to non-root user
+USER appuser
+# Pre-download models and datasets at build time, as appuser, into the cache set above
+RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"
+RUN python -c "from datasets import load_dataset; load_dataset('Gustavosta/Stable-Diffusion-Prompts')"
+# Expose port 7860
+EXPOSE 7860
+# Command to run the API
+CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -9,3 +9,57 @@ short_description: Improve image quality with better prompts!
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Prompt Search Engine
+## Overview
+This project implements a prompt search engine for Stable Diffusion models. Given an input prompt, the search engine returns the top `n` most similar prompts from a corpus of existing prompts. This helps generate higher-quality images by suggesting more effective prompts.
+The search engine consists of two main components:
+- **Prompt Vectorizer**: Converts prompts into numerical vectors using a pre-trained embedding model.
+- **Similarity Scorer**: Measures the similarity between the input prompt and existing prompts using cosine similarity.
+## Setup Instructions
+### Requirements
+- Python >= 3.9
+- pip
+### Installation
+1. **Clone the repository**
+   ```bash
+   git clone <repository-url>
+   cd <repository-directory>
+   ```
+2. **Create a virtual environment (optional)**
+   ```bash
+   python -m venv venv
+   source venv/bin/activate
+   ```
+3. **Install dependencies**
+   ```bash
+   pip install -r requirements.txt
+   ```
+## Running the `run.py` script
+The `run.py` script allows you to run the prompt search engine from the command line.
+### Usage
+   ```bash
+   python run.py --query "Your query prompt here" --n 5 --model "all-MiniLM-L6-v2"
+   ```
+### Arguments
+- `--query`: The query prompt (required).
+- `--n`: The number of similar prompts to return (default 5).
+- `--model`: The name of the SBERT model to use (default "all-MiniLM-L6-v2").
+### Example
+`python run.py --query "A cat wearing glasses, sitting at a computer" --n 7`

api.py ADDED Viewed

	@@ -0,0 +1,49 @@

+# api.py
+from fastapi import FastAPI
+from pydantic import BaseModel
+from typing import List, Tuple
+from prompt_search_engine import PromptSearchEngine
+from vectorizer import Vectorizer
+from datasets import load_dataset
+# Define the request and response models
+class QueryRequest(BaseModel):
+    """Request body accepted by the POST /search endpoint."""
+    # Free-text prompt to look up in the corpus.
+    query: str
+    # Number of matches requested.
+    n: int = 5  # default value
+class QueryResponse(BaseModel):
+    """Response schema describing results as (score, prompt) pairs.
+
+    NOTE(review): the /search handler in this file builds a list of
+    {"score": ..., "prompt": ...} dicts and does not register this model as
+    its response model, so this schema is not enforced as written. Confirm
+    which shape is the intended one.
+    """
+    results: List[Tuple[float, str]]
+# Initialize FastAPI app
+app = FastAPI()
+# Global variable to store the search engine. It stays None until the
+# startup hook in this module assigns it; the /search handler checks for None.
+search_engine = None
+# Load prompts and initialize the search engine when the app starts
+@app.on_event("startup")
+def startup_event():
+    global search_engine
+    # Load the prompts
+    dataset = load_dataset("Gustavosta/Stable-Diffusion-Prompts")
+    prompts = dataset["train"]["Prompt"]
+    # For testing, limit the number of prompts
+    prompts = prompts[:1000]  # Adjust the number as needed
+    # Initialize vectorizer with the default model
+    vectorizer = Vectorizer(model="all-MiniLM-L6-v2")
+    # Initialize the search engine
+    search_engine = PromptSearchEngine(prompts, vectorizer)
+# Define the /search endpoint
+@app.post("/search")
+def search_prompts(request: QueryRequest):
+    global search_engine
+    if search_engine is None:
+        return {"results": []}
+    # Get the top-n most similar prompts
+    similar_prompts = search_engine.most_similar(query=request.query, n=request.n)
+    # Prepare the response
+    results = [{"score": float(score), "prompt": prompt} for score, prompt in similar_prompts]
+    return {"results": results}

prompt_search_engine.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from typing import List, Tuple, Sequence
+import numpy as np
+from vectorizer import Vectorizer
+from similarity import cosine_similarity
+class PromptSearchEngine:
+    def __init__(self, prompts: Sequence[str], vectorizer: Vectorizer) -> None:
+        self.prompts = prompts
+        self.vectorizer = vectorizer
+        self.corpus_vectors = vectorizer.transform(prompts)
+    def most_similar(self, query, n = 5) -> List[Tuple[float, str]]:
+        query_vector = self.vectorizer.transform([query])[0]
+        similarities = cosine_similarity(query_vector, self.corpus_vectors)
+        top_indices = similarities.argsort()[-n:][::-1]
+        return [(similarities[i], self.prompts[i]) for i in top_indices]

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+sentence-transformers
+numpy
+datasets
+fastapi
+uvicorn

similarity.py ADDED Viewed

	@@ -0,0 +1,6 @@

+import numpy as np
+def cosine_similarity(query_vector: np.ndarray, corpus_vectors: np.ndarray) -> np.ndarray:
+    query_norm = query_vector / np.linalg.norm(query_vector)
+    corpus_norm = corpus_vectors / np.linalg.norm(corpus_vectors, axis=1, keepdims=True)
+    return np.dot(corpus_norm, query_norm)

vectorizer.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from sentence_transformers import SentenceTransformer
+import numpy as np
+from typing import Sequence
+class Vectorizer:
+    def __init__(self, model) -> None:
+        self.model = SentenceTransformer(model)
+    def transform(self, prompts: Sequence[str]) -> np.ndarray:
+        return self.model.encode(prompts)