rphrp1985 committed
Commit: d972151
1 Parent(s): 596a439

Update app.py

Files changed (1): app.py (+9 -7)
app.py CHANGED
@@ -26,16 +26,11 @@ model_id = "CohereForAI/c4ai-command-r-plus-4bit"
 
 tokenizer = AutoTokenizer.from_pretrained(model_id, token= token)
 
-# model = AutoModelForCausalLM.from_pretrained(model_id, token= token, torch_dtype=torch.bfloat16,
-# # attn_implementation="flash_attention_2",
-# # low_cpu_mem_usage=True,
-# llm_int8_enable_fp32_cpu_offload=True,
-# device_map="auto"
-# )
+#
 
 
 
-@spaces.GPU(duration=180)
+@spaces.GPU(duration=300)
 def respond(
     message,
     history: list[tuple[str, str]],
@@ -44,6 +39,13 @@ def respond(
     temperature,
     top_p,
 ):
+    model = AutoModelForCausalLM.from_pretrained(model_id, token= token,
+                                                 # torch_dtype=torch.bfloat16,
+                                                 # attn_implementation="flash_attention_2",
+                                                 # low_cpu_mem_usage=True,
+                                                 # llm_int8_enable_fp32_cpu_offload=True,
+                                                 device_map="auto"
+                                                 )
     messages = [{"role": "user", "content": "Hello, how are you?"}]
     input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
     ## <BOS_TOKEN><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
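
Net effect of the commit: the dead, commented-out module-level model load is deleted, the ZeroGPU window requested by @spaces.GPU is raised from 180 to 300 seconds, and the model is now loaded lazily inside respond() with device_map="auto", so the slow load of the 4-bit Command R+ weights happens while a GPU is actually attached. A minimal sketch of the resulting pattern follows; note that the middle parameters of respond (system_message, max_tokens) and everything after apply_chat_template (the generate/decode step and its settings) fall outside the diff and are assumptions here, modeled on the stock Gradio chat template.

import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "CohereForAI/c4ai-command-r-plus-4bit"
token = "hf_..."  # placeholder; the Space presumably reads this from a secret

# The tokenizer needs no GPU, so it can load once at import time.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)

@spaces.GPU(duration=300)  # request a ZeroGPU slot for up to 300 s
def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Loading inside the decorated function means device_map="auto" places
    # the weights while the GPU is attached, not at Space startup.
    model = AutoModelForCausalLM.from_pretrained(
        model_id, token=token, device_map="auto"
    )
    messages = [{"role": "user", "content": message}]
    input_ids = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    # Assumed generation step; the diff ends before this point.
    output = model.generate(
        input_ids,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )
    return tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)

One trade-off worth noting: reloading the model on every call keeps the GPU allocation honest but pays the full from_pretrained cost per request, which plausibly motivates raising the duration budget from 180 to 300 seconds.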