limitedonly41 committed
Commit b0da584 · verified · 1 Parent(s): 410c398

Update app.py

Files changed (1): app.py (+38, -17)
app.py CHANGED
@@ -1,5 +1,4 @@
 import gradio as gr
-from unsloth import FastLanguageModel
 import torch
 
 # Configuration
@@ -9,24 +8,39 @@ load_in_4bit = True # Use 4-bit quantization to reduce memory usage
 
 peft_model_name = "limitedonly41/website_mistral7b_v02_1200_finetuned_5_big"
 
-# Load the model and tokenizer
-model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name=peft_model_name, # YOUR MODEL YOU USED FOR TRAINING
-    max_seq_length=max_seq_length,
-    dtype=dtype,
-    load_in_4bit=load_in_4bit,
-)
-FastLanguageModel.for_inference(model) # Enable native 2x faster inference
+# from unsloth import FastLanguageModel
 
-def return_prediction(prompt):
-    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
-    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
-    ans = tokenizer.batch_decode(outputs)[0]
-    ans_pred = ans.split('### Response:')[1].split('<')[0]
-    return ans_pred
+# # Load the model and tokenizer
+# model, tokenizer = FastLanguageModel.from_pretrained(
+#     model_name=peft_model_name, # YOUR MODEL YOU USED FOR TRAINING
+#     max_seq_length=max_seq_length,
+#     dtype=dtype,
+#     load_in_4bit=load_in_4bit,
+# )
+# FastLanguageModel.for_inference(model) # Enable native 2x faster inference
+
+# def return_prediction(prompt):
+#     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+#     outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
+#     ans = tokenizer.batch_decode(outputs)[0]
+#     ans_pred = ans.split('### Response:')[1].split('<')[0]
+#     return ans_pred
 
-@spaces.GPU
+@spaces.GPU()
 def classify_website(site_text):
+
+    from unsloth import FastLanguageModel
+
+    # Load the model and tokenizer
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=peft_model_name, # YOUR MODEL YOU USED FOR TRAINING
+        max_seq_length=max_seq_length,
+        dtype=dtype,
+        load_in_4bit=load_in_4bit,
+    )
+    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
+
+
     prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
 
 ### Instruction:
@@ -40,7 +54,14 @@ Categorize the website into one of the 3 categories:
 {site_text}
 
 ### Response:"""
-    return return_prediction(prompt)
+
+    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
+    ans = tokenizer.batch_decode(outputs)[0]
+    ans_pred = ans.split('### Response:')[1].split('<')[0]
+    return ans_pred
+
+    # return return_prediction(prompt)
 
 # Create a Gradio interface
 iface = gr.Interface(
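
The pattern this commit adopts is the standard one for Hugging Face ZeroGPU Spaces: no GPU is attached while the module is imported, so CUDA-touching work (the unsloth import, the 4-bit model load, generation) has to happen inside a @spaces.GPU()-decorated function. Below is a minimal sketch of the resulting app.py under stated assumptions: import spaces and the max_seq_length/dtype values live in the configuration lines the diff elides, the category list inside the prompt is abbreviated, and the gr.Interface(...) arguments are illustrative, since the diff truncates at the opening parenthesis.

import gradio as gr
import spaces  # assumption: imported in the elided lines; provides @spaces.GPU()

# Configuration -- load_in_4bit matches the hunk header; the other two values
# are assumptions, since the diff elides these lines.
max_seq_length = 2048
dtype = None          # None lets Unsloth auto-detect (float16/bfloat16)
load_in_4bit = True   # Use 4-bit quantization to reduce memory usage

peft_model_name = "limitedonly41/website_mistral7b_v02_1200_finetuned_5_big"

@spaces.GPU()
def classify_website(site_text):
    # Import and load inside the GPU-allocated call, as this commit does.
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=peft_model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

    # Alpaca-style template from the diff; the "..." stands in for the
    # category list, which sits in lines the diff does not show.
    prompt = (
        "Below is an instruction that describes a task, paired with an input "
        "that provides further context. Write a response that appropriately "
        "completes the request.\n\n"
        "### Instruction:\nCategorize the website into one of the 3 categories:\n"
        "...\n\n"
        f"### Input:\n{site_text}\n\n### Response:"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
    ans = tokenizer.batch_decode(outputs)[0]
    # The decode includes the prompt, so split on the response marker and
    # drop the trailing special token (e.g. "</s>").
    return ans.split('### Response:')[1].split('<')[0]

# Illustrative wiring; the actual Interface arguments are cut off in the diff.
iface = gr.Interface(fn=classify_website, inputs="text", outputs="text")
iface.launch()

One consequence of this layout is that every request pays the full model-load time. A common refinement, not part of this commit, is to cache the model in a module-level variable after the first decorated call so later requests reuse it.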