Spaces:

jackbond2024
/

glm4

Runtime error

App Files Files Community

glm4 / trans_cli_demo.py

jackbond2024

Upload folder using huggingface_hub

416da8e verified 6 months ago

raw

history blame contribute delete

4.23 kB

	"""
	This script creates a CLI demo with transformers backend for the glm-4-9b model,
	allowing users to interact with the model through a command-line interface.

	Usage:
	- Run the script to start the CLI demo.
	- Interact with the model by typing questions and receiving responses.

	Note: The script includes a modification to handle markdown to plain text conversion,
	ensuring that the CLI interface displays formatted text correctly.

	To use flash attention, install flash-attn and pass attn_implementation="flash_attention_2" when loading the model.
	"""

	import os
	import torch
	from threading import Thread
	from transformers import AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer, AutoModel

	# Location of the model: a Hugging Face Hub id or a local checkpoint directory.
	# Override the default by setting the MODEL_PATH environment variable; no
	# machine-specific path is hard-coded so the script stays portable.
	MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/glm-4-9b-chat')
	print("MODEL_PATH: " + MODEL_PATH)

	## If using a PEFT model:
	# def load_model_and_tokenizer(model_dir, trust_remote_code: bool = True):
	#     if (model_dir / 'adapter_config.json').exists():
	#         model = AutoModel.from_pretrained(
	#             model_dir, trust_remote_code=trust_remote_code, device_map='auto'
	#         )
	#         tokenizer_dir = model.peft_config['default'].base_model_name_or_path
	#     else:
	#         model = AutoModel.from_pretrained(
	#             model_dir, trust_remote_code=trust_remote_code, device_map='auto'
	#         )
	#         tokenizer_dir = model_dir
	#     tokenizer = AutoTokenizer.from_pretrained(
	#         tokenizer_dir, trust_remote_code=trust_remote_code, use_fast=False
	#     )
	#     return model, tokenizer


	# Load the tokenizer stored alongside the checkpoint at MODEL_PATH.
	tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True, encode_special_tokens=True)

	# Load the model weights. Optional extra keyword arguments for flash attention:
	#   attn_implementation="flash_attention_2"  # Use Flash Attention
	#   torch_dtype=torch.bfloat16  # using flash-attn must use bfloat16 or float16
	model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True, device_map="auto")
	# Switch to evaluation mode; eval() returns the module itself.
	model = model.eval()


	class StopOnTokens(StoppingCriteria):
	    """Stopping criterion that ends generation on an end-of-sequence token.

	    The stop ids are read from the module-level ``model.config.eos_token_id``
	    on every call.
	    """

	    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
	        """Return True when the last token of the first sequence is a stop id.

	        Args:
	            input_ids: Token ids generated so far; only row 0 is inspected.
	            scores: Unused; accepted to satisfy the stopping-criteria call signature.
	            **kwargs: Unused.

	        Returns:
	            True if generation should stop, False otherwise.
	        """
	        stop_ids = model.config.eos_token_id
	        if stop_ids is None:
	            # No end-of-sequence id configured: never stop from this criterion.
	            return False
	        if isinstance(stop_ids, int):
	            # The config may hold a single id instead of a list of ids.
	            stop_ids = [stop_ids]
	        last_token = input_ids[0][-1].item()
	        return last_token in stop_ids


	if __name__ == "__main__":
	    # Interactive chat loop: read a user message, stream the model's reply to
	    # stdout, and remember the exchange for the next turn.
	    history = []  # [user_message, model_reply] pairs, oldest first
	    max_length = 8192  # passed to generate() as max_new_tokens
	    top_p = 0.8
	    temperature = 0.6
	    stop = StopOnTokens()

	    print("Welcome to the GLM-4-9B CLI chat. Type your messages below.")
	    while True:
	        try:
	            user_input = input("\nYou: ")
	        except (EOFError, KeyboardInterrupt):
	            # Ctrl-D / Ctrl-C at the prompt ends the session without a traceback.
	            print()
	            break
	        if user_input.strip().lower() in ("exit", "quit"):
	            break
	        history.append([user_input, ""])

	        # Replay the completed turns, then add the message that was just typed.
	        messages = []
	        for user_msg, model_msg in history[:-1]:
	            if user_msg:
	                messages.append({"role": "user", "content": user_msg})
	            if model_msg:
	                messages.append({"role": "assistant", "content": model_msg})
	        messages.append({"role": "user", "content": user_input})

	        model_inputs = tokenizer.apply_chat_template(
	            messages,
	            add_generation_prompt=True,
	            tokenize=True,
	            return_tensors="pt"
	        ).to(model.device)
	        streamer = TextIteratorStreamer(
	            tokenizer=tokenizer,
	            timeout=60,
	            skip_prompt=True,
	            skip_special_tokens=True
	        )
	        generate_kwargs = {
	            "input_ids": model_inputs,
	            "streamer": streamer,
	            "max_new_tokens": max_length,
	            "do_sample": True,
	            "top_p": top_p,
	            "temperature": temperature,
	            "stopping_criteria": StoppingCriteriaList([stop]),
	            "repetition_penalty": 1.2,
	            "eos_token_id": model.config.eos_token_id,
	        }
	        # generate() runs in a worker thread so this thread can print text
	        # chunks as the streamer yields them.
	        t = Thread(target=model.generate, kwargs=generate_kwargs)
	        t.start()
	        print("GLM-4:", end="", flush=True)
	        for new_token in streamer:
	            if new_token:
	                print(new_token, end="", flush=True)
	                history[-1][1] += new_token
	        # Wait for generate() to return so consecutive turns never overlap.
	        t.join()

	        history[-1][1] = history[-1][1].strip()