Update app.py
app.py CHANGED
@@ -3,20 +3,31 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
 # Define your repository names.
-# For a fully merged model, you typically use the model repo (and a matching tokenizer repo).
 MODEL_NAME = "wedo2910/research_ai"
 TOKENIZER_NAME = "wedo2910/research_ai_tok"
 
-#
-
-
-model
+# Check if CUDA is available and choose an appropriate device mapping.
+if torch.cuda.is_available():
+    device = "cuda"
+    # When using GPU, you might let the model auto-map to available GPUs.
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_NAME,
+        trust_remote_code=True,
+        device_map="auto"
+    )
+else:
+    device = "cpu"
+    # Force CPU loading; this bypasses GPU-specific integrations like bitsandbytes.
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_NAME,
+        trust_remote_code=True,
+        device_map="cpu"
+    )
 
-#
-
-model = model.to(device)
+# Load the tokenizer.
+tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, trust_remote_code=True)
 
-# Optionally set model to evaluation mode.
+# Optionally set the model to evaluation mode.
 model.eval()
 
 def single_inference(question: str, max_new_tokens: int, temperature: float) -> str:
@@ -25,14 +36,13 @@ def single_inference(question: str, max_new_tokens: int, temperature: float) ->
 
     The prompt is constructed using a system instruction in Arabic, and the question is appended.
     """
-    # Define
+    # Define messages for a simulated chat conversation.
     messages = [
         {"role": "system", "content": "اجب علي الاتي بالعربي فقط."},
         {"role": "user", "content": question},
     ]
 
-    #
-    # If available, use it; otherwise, build a prompt manually.
+    # If the tokenizer has an `apply_chat_template` method, use it; otherwise, build the prompt manually.
     if hasattr(tokenizer, "apply_chat_template"):
         input_ids = tokenizer.apply_chat_template(
             messages,
@@ -40,26 +50,21 @@ def single_inference(question: str, max_new_tokens: int, temperature: float) ->
             return_tensors="pt"
         ).to(device)
     else:
-        # Manually build the prompt
         system_prompt = "اجب علي الاتي بالعربي فقط.\n"
         user_prompt = f"السؤال: {question}\n"
         full_prompt = system_prompt + user_prompt
         input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids.to(device)
 
-    # Define the terminator tokens.
-    # (For a merged model, usually the eos_token_id is sufficient.)
-    terminators = [tokenizer.eos_token_id]
-
     # Generate the output.
     outputs = model.generate(
         input_ids,
         max_new_tokens=max_new_tokens,
         do_sample=True,
         temperature=temperature,
-        #
+        # You can add more generation parameters if needed.
     )
 
-    # Remove the prompt part from the output.
+    # Remove the prompt part from the generated output.
     generated_ids = outputs[0][input_ids.shape[-1]:]
 
     # Decode the tokens into a string.
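For context, here is a minimal sketch of how the updated single_inference helper could be driven after this change. The module name app, the example question, and the generation settings below are assumptions for illustration only; they are not values taken from this commit, and the Space may instead wire the function into its own UI.

# Hypothetical driver script; assumes the file shown above is importable as `app`.
# Importing app runs the model/tokenizer loading added in this diff.
from app import single_inference

# Illustrative Arabic question ("What is the capital of Egypt?") with assumed
# generation settings.
answer = single_inference(
    "ما هي عاصمة مصر؟",
    max_new_tokens=256,
    temperature=0.7,
)
print(answer)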