Spaces:

Marroco93
/

PacmanAI-2

Sleeping

App Files Community

Marroco93 committed on Mar 29, 2024

Commit

9441c54

1 Parent(s): d0c61b6

s

Browse files

Files changed (1) hide show

main.py +21 -31

main.py CHANGED Viewed

@@ -3,11 +3,11 @@ from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
 from huggingface_hub import InferenceClient
 import uvicorn
-import json  # Make sure to import json
 app = FastAPI()
 client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
 class Item(BaseModel):
@@ -15,7 +15,7 @@ class Item(BaseModel):
     history: list
     system_prompt: str
     temperature: float = 0.0
-    max_new_tokens: int = 1048
     top_p: float = 0.15
     repetition_penalty: float = 1.0
@@ -27,40 +27,30 @@ def format_prompt(message, history):
     prompt += f"[INST] {message} [/INST]"
     return prompt
-import json  # Import the JSON module
-def generate(item: Item):
-    temperature = float(item.temperature)
-    if temperature < 1e-2:
-        temperature = 1e-2
-    top_p = float(item.top_p)
-    generate_kwargs = dict(
-        temperature=temperature,
-        max_new_tokens=item.max_new_tokens,
-        top_p=top_p,
-        repetition_penalty=item.repetition_penalty,
-        do_sample=True,
-        seed=42,
-    )
     formatted_prompt = format_prompt(f"{item.system_prompt}, {item.prompt}", item.history)
-    stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
-    # Convert stream to a list to check if it's the last element
-    responses = list(stream)
-    for i, response in enumerate(responses):
-        # Prepare the chunk as a JSON object
         chunk = {
             "text": response.token.text,
-            "complete": i == len(responses) - 1  # True if this is the last chunk
         }
-        # Yield the JSON-encoded string with a newline to separate chunks
         yield json.dumps(chunk).encode("utf-8") + b"\n"
 @app.post("/generate/")
 async def generate_text(item: Item):
-    return StreamingResponse(generate(item), media_type="application/x-ndjson")

 from pydantic import BaseModel
 from huggingface_hub import InferenceClient
 import uvicorn
+from typing import Generator
 app = FastAPI()
+# Initialize the InferenceClient with your model
 client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
 class Item(BaseModel):
     history: list
     system_prompt: str
     temperature: float = 0.0
+    max_new_tokens: int = 9000
     top_p: float = 0.15
     repetition_penalty: float = 1.0
     prompt += f"[INST] {message} [/INST]"
     return prompt
+def generate_stream(item: Item) -> Generator[bytes, None, None]:
     formatted_prompt = format_prompt(f"{item.system_prompt}, {item.prompt}", item.history)
+    generate_kwargs = {
+        "temperature": item.temperature,
+        "max_new_tokens": item.max_new_tokens,
+        "top_p": item.top_p,
+        "repetition_penalty": item.repetition_penalty,
+        "do_sample": True,
+        "seed": 42,  # Adjust or omit the seed as needed
+    }
+    # Stream the response from the InferenceClient
+    for response in client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True):
+        # This assumes 'details=True' gives you a structure where you can access the text like this
         chunk = {
             "text": response.token.text,
+            "complete": response.generated_text is not None  # Adjust based on how you detect completion
         }
         yield json.dumps(chunk).encode("utf-8") + b"\n"
 @app.post("/generate/")
 async def generate_text(item: Item):
+    # Stream response back to the client
+    return StreamingResponse(generate_stream(item), media_type="application/x-ndjson")
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)