Update app.py
app.py CHANGED
@@ -8,6 +8,12 @@ from transformers import AutoProcessor, AutoModelForCausalLM
 import re
 from PIL import Image
 import io
+import json
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
 
 import subprocess
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
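Note on the logging change above: `logging.basicConfig(level=logging.DEBUG)` raises the root logger to DEBUG, so every library in the process becomes verbose, not just app.py. If that is more output than intended, a minimal alternative sketch (an assumption on my part, not what this commit does) is to leave the root at INFO and raise only this module's logger:

```python
import logging

# Hypothetical alternative to the commit's basicConfig(level=DEBUG):
# keep third-party loggers at INFO, but let this module emit DEBUG.
logging.basicConfig(level=logging.INFO)   # root logger and its handler
logger = logging.getLogger(__name__)      # this module's logger
logger.setLevel(logging.DEBUG)            # only this module's records go down to DEBUG

logger.debug("visible: this module is at DEBUG")
logging.getLogger("urllib3").debug("suppressed: inherits the INFO root level")
```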
@@ -22,6 +28,7 @@ llm = Llama(
     ),
     n_ctx=2048,
     n_gpu_layers=100, # change n_gpu_layers if you have more or less VRAM
+    chat_format="llama-3",
 )
 
 
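The `chat_format="llama-3"` argument tells llama-cpp-python which built-in chat template `create_chat_completion` should apply when it turns a list of role/content messages into the model's prompt (the same Llama 3 header/eot tokens that appear in the stop list further down). A minimal sketch of the idea, with a placeholder model path rather than this Space's actual download logic:

```python
from llama_cpp import Llama

# Placeholder path; the real app downloads a GGUF file at startup.
llm = Llama(
    model_path="models/llama-3-8b-instruct.Q4_K_M.gguf",
    n_ctx=2048,
    n_gpu_layers=100,
    chat_format="llama-3",  # apply the built-in Llama 3 chat template
)

result = llm.create_chat_completion(
    [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ],
    max_tokens=64,
)
print(result["choices"][0]["message"]["content"])
```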
@@ -63,12 +70,23 @@ def generate_text(
         yield picoutput
     else:
         temp = ""
-        system_prompt
+        # Create system_prompt as a dictionary
+        system_prompt = {"role": "system", "content": system_message}
+
+        # Create history_prompt as a list of dictionaries
+        history_prompt = []
         for interaction in history:
-
-
-
-
+            user_part = {"role": "user", "content": str(interaction[0])}
+            assistant_part = {"role": "assistant", "content": str(interaction[1])}
+            history_prompt.extend([user_part, assistant_part])
+
+        # Create user_input_part as a dictionary
+        user_input_part = {"role": "user", "content": str(in_text)}
+
+        # Construct input_prompt as a list of dictionaries
+        input_prompt = [system_prompt] + history_prompt + [user_input_part]
+        logger.debug(f"Input Prompt: {input_prompt}")
+        output = llm.create_chat_completion(
             input_prompt,
             temperature=temperature,
             top_p=top_p,
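To make the rewritten block concrete, here is the shape of `input_prompt` it produces for a hypothetical one-turn history (all values below are illustrative, not taken from the app):

```python
# Hypothetical inputs standing in for the Gradio callback's arguments.
system_message = "You are a concise assistant."
history = [("What is llama.cpp?", "A C/C++ inference engine for GGUF models.")]
in_text = "Does it support GPU offload?"

system_prompt = {"role": "system", "content": system_message}

history_prompt = []
for interaction in history:
    history_prompt.extend([
        {"role": "user", "content": str(interaction[0])},
        {"role": "assistant", "content": str(interaction[1])},
    ])

user_input_part = {"role": "user", "content": str(in_text)}
input_prompt = [system_prompt] + history_prompt + [user_input_part]

# input_prompt is now:
# [{'role': 'system',    'content': 'You are a concise assistant.'},
#  {'role': 'user',      'content': 'What is llama.cpp?'},
#  {'role': 'assistant', 'content': 'A C/C++ inference engine for GGUF models.'},
#  {'role': 'user',      'content': 'Does it support GPU offload?'}]
```

This is the OpenAI-style message list that `create_chat_completion` expects now that the model is constructed with `chat_format="llama-3"`.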
@@ -84,12 +102,16 @@ def generate_text(
                 "SYSTEM:",
                 "<|start_header_id|>",
                 "<|eot_id|>",
+                "<im_end>",
+                "<|im_end|>",
             ],
             stream=True,
         )
         for out in output:
             stream = copy.deepcopy(out)
-
+            logger.debug(f"Stream: {stream}")
+            if 'delta' in stream['choices'][0] and 'content' in stream['choices'][0]['delta']:
+                temp += stream["choices"][0]["delta"]["content"]
             yield temp
 
 
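The guard added before `temp += ...` exists because, with `stream=True`, `create_chat_completion` yields OpenAI-style chunks: the first delta usually carries only the role, the middle ones carry `content`, and the final one is empty apart from a finish reason, so indexing `delta["content"]` unconditionally raises a KeyError on some chunks. A standalone sketch of consuming the stream (assuming `llm` is the instance configured above; the prompt is a placeholder):

```python
# Accumulate streamed text the same way the updated loop does.
chunks = llm.create_chat_completion(
    [{"role": "user", "content": "Say hello in five words."}],
    stream=True,
)

text = ""
for chunk in chunks:
    delta = chunk["choices"][0].get("delta", {})
    # Skip chunks that carry no text (role-only first chunk, empty final chunk).
    if "content" in delta:
        text += delta["content"]
print(text)
```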