to cpu / using transformers
inference.py CHANGED (+25 -17)
@@ -1,14 +1,14 @@
-from transformers import TextStreamer
+from transformers import TextStreamer, AutoModelForCausalLM, AutoTokenizer
 from typing import Tuple, List, Dict
 import torch
-from unsloth import FastLanguageModel
+# from unsloth import FastLanguageModel
 
 def load_model(
     model_name: str,
     max_seq_length: int,
     dtype: torch.dtype,
     load_in_4bit: bool
-) -> Tuple[FastLanguageModel, any]:
+) -> Tuple[AutoModelForCausalLM, any]:
     """
     Load and initialize the language model for inference.
 
@@ -21,13 +21,18 @@ def load_model(
     Returns:
         Tuple[FastLanguageModel, any]: Tuple containing the model and tokenizer
     """
-    model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "lora_model"
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+    model = AutoModelForCausalLM.from_pretrained(
         model_name=model_name,
-        max_seq_length=max_seq_length,
-        dtype=dtype,
-        load_in_4bit=load_in_4bit,
+        torch_dtype=dtype,
+        device_map="auto"
     )
-    FastLanguageModel.for_inference(model)
+
+    model.eval()  # Set model to evaluation mode
+
     return model, tokenizer
 
 def prepare_input(
@@ -54,7 +59,7 @@ def prepare_input(
     ).to(device)
 
 def generate_response(
-    model: FastLanguageModel,
+    model: AutoModelForCausalLM,
     inputs: torch.Tensor,
     tokenizer: any,
     max_new_tokens: int = 2000,
@@ -77,16 +82,19 @@ def generate_response(
     Returns:
         str: Generated response
     """
-    text_streamer = TextStreamer(tokenizer, skip_prompt=skip_prompt)
+    # text_streamer = TextStreamer(tokenizer, skip_prompt=skip_prompt)
+    inputs = tokenizer(inputs, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
     outputs = model.generate(
-        input_ids=inputs,
-        streamer=text_streamer,
-        max_new_tokens=max_new_tokens,
-        use_cache=True,
-        temperature=temperature,
-        min_p=min_p
+        **inputs,
+        max_length=2000
+        # streamer=text_streamer,
+        # max_new_tokens=max_new_tokens,
+        # use_cache=True,
+        # temperature=temperature,
+        # min_p=min_p
     )
-    return outputs
+    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    return generated_text
 
 def main(
     USER_INPUT_CODE = "program sum_of_numbers\n implicit none\n integer :: n, i, sum\n\n ! Initialize variables\n sum = 0\n\n ! Get user input\n print *, \"Enter a positive integer:\"\n read *, n\n\n ! Calculate the sum of numbers from 1 to n\n do i = 1, n\n sum = sum + i\n end do\n\n ! Print the result\n print *, \"The sum of numbers from 1 to\", n, \"is\", sum\nend program sum_of_numbers",
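For orientation, below is a minimal, self-contained sketch of the transformers code path this commit switches to. It is an illustration under assumptions, not part of the commit: "lora_model" is taken to be the fine-tuned checkpoint directory the Space already uses, and the prompt is a stand-in. Two caveats: transformers' AutoModelForCausalLM.from_pretrained takes the checkpoint path as its first positional argument (not a model_name= keyword as written in the diff), and device_map="auto" requires the accelerate package.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "lora_model"  # assumed: local fine-tuned checkpoint directory

# Load the tokenizer and model; device_map="auto" places weights on GPU
# when one is available and falls back to CPU otherwise.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,  # positional pretrained_model_name_or_path
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)
model.eval()  # evaluation mode, as in the commit

prompt = "Explain what this Fortran program does."  # stand-in input
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=200)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The sketch uses max_new_tokens, which bounds only the generated continuation, whereas the diff's max_length=2000 caps prompt and continuation combined; the pre-commit unsloth version also used max_new_tokens.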