ybelkada committed on
Commit e9b7a3f
1 Parent(s): 4897148

Update README.md

Files changed (1):
  1. README.md +37 -20
README.md CHANGED
@@ -45,11 +45,14 @@ Find below some example scripts on how to use the model in `transformers` (Make
  ```python
  from transformers import AutoTokenizer, AutoModelForCausalLM

- tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-mamba-7b")
- model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-mamba-7b")
+ tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-mamba-7b-instruct")
+ model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-mamba-7b-instruct")

- input_text = "Question: How many hours in one day? Answer: "
- input_ids = tokenizer(input_text, return_tensors="pt").input_ids
+ # We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
+ messages = [
+     {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
+ ]
+ input_ids = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True).input_ids

  outputs = model.generate(input_ids)
  print(tokenizer.decode(outputs[0]))
@@ -66,11 +69,14 @@ print(tokenizer.decode(outputs[0]))
  # pip install accelerate
  from transformers import AutoTokenizer, AutoModelForCausalLM

- tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-mamba-7b")
- model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-mamba-7b", device_map="auto")
+ tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-mamba-7b-instruct")
+ model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-mamba-7b-instruct", device_map="auto")

- input_text = "Question: How many hours in one day? Answer: "
- input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
+ # We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
+ messages = [
+     {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
+ ]
+ input_ids = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True).input_ids.to("cuda")

  outputs = model.generate(input_ids)
  print(tokenizer.decode(outputs[0]))
@@ -87,13 +93,16 @@ print(tokenizer.decode(outputs[0]))
  import torch
  from transformers import AutoTokenizer, AutoModelForCausalLM

- tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-mamba-7b")
- model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-mamba-7b", torch_dtype=torch.bfloat16).to(0)
+ tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-mamba-7b-instruct")
+ model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-mamba-7b-instruct", torch_dtype=torch.bfloat16).to(0)

  model = torch.compile(model)

- input_text = "Question: How many hours in one day? Answer: "
- input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
+ # We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
+ messages = [
+     {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
+ ]
+ input_ids = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True).input_ids.to("cuda")

  outputs = model.generate(input_ids)
  print(tokenizer.decode(outputs[0]))
@@ -114,11 +123,14 @@ print(tokenizer.decode(outputs[0]))
  import torch
  from transformers import AutoTokenizer, AutoModelForCausalLM

- tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-mamba-7b")
- model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-mamba-7b", device_map="auto", torch_dtype=torch.float16)
+ tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-mamba-7b-instruct")
+ model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-mamba-7b-instruct", device_map="auto", torch_dtype=torch.float16)

- input_text = "Question: How many hours in one day? Answer: "
- input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
+ # We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
+ messages = [
+     {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
+ ]
+ input_ids = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True).input_ids.to("cuda")

  outputs = model.generate(input_ids)
  print(tokenizer.decode(outputs[0]))
@@ -135,11 +147,14 @@ print(tokenizer.decode(outputs[0]))
  # pip install bitsandbytes accelerate
  from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

- tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-mamba-7b")
- model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-mamba-7b", device_map="auto", quantization_config=BitsAndBytesConfig(load_in_4bit=True))
+ tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-mamba-7b-instruct")
+ model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-mamba-7b-instruct", device_map="auto", quantization_config=BitsAndBytesConfig(load_in_4bit=True))

- input_text = "Question: How many hours in one day? Answer: "
- input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
+ # We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
+ messages = [
+     {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
+ ]
+ input_ids = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True).input_ids.to("cuda")

  outputs = model.generate(input_ids)
  print(tokenizer.decode(outputs[0]))
@@ -164,6 +179,8 @@ In particular, we used samples coming from [Fineweb-edu](https://huggingface.co/

  The data was tokenized with the Falcon-[7B](https://huggingface.co/tiiuae/falcon-7B)/[11B](https://huggingface.co/tiiuae/falcon-11B) tokenizer.

+ After pre-training, the model has been further fine-tuned on instruction data.
+
  ## Training Procedure
  Falcon-Mamba-7B was trained on 256 H100 80GB GPUs for the majority of the training, using a 3D parallelism strategy (TP=1, PP=1, DP=256) combined with ZeRO.
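
A side note on the added snippets: in `transformers`, `apply_chat_template(..., tokenize=False, ...)` returns the rendered prompt as a plain string rather than an encoding with an `input_ids` attribute, so a runnable version would tokenize that string in a separate step. Below is a minimal sketch under that assumption; the model id is taken from the diff, while `device_map="auto"` and `max_new_tokens=64` are illustrative choices and not part of the commit.

```python
# pip install accelerate
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-mamba-7b-instruct")
model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-mamba-7b-instruct", device_map="auto")

messages = [
    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
]
# Render the chat template to a prompt string, then tokenize it separately.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

outputs = model.generate(input_ids, max_new_tokens=64)
print(tokenizer.decode(outputs[0]))
```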