Spaces:

bofenghuang
/

vigogne-chat

Runtime error

vigogne-chat / app.py

b107fa5 over 1 year ago

12.7 kB

	#!/usr/bin/env python
	# coding=utf-8
	# Copyright 2023 Bofeng Huang

	"""
	Modified from: https://huggingface.co/spaces/mosaicml/mpt-7b-chat/raw/main/app.py

	Usage:
	CUDA_VISIBLE_DEVICES=0

	python vigogne/demo/demo_chat.py \
	--base_model_name_or_path huggyllama/llama-7b \
	--lora_model_name_or_path bofenghuang/vigogne-chat-7b
	"""

	import json

	# import datetime
	import logging
	import os
	import re
	from threading import Event, Thread
	from typing import List, Optional

	import gradio as gr

	# import requests
	import torch
	from peft import PeftModel
	from transformers import (
	AutoModelForCausalLM,
	AutoTokenizer,
	GenerationConfig,
	LlamaTokenizer,
	StoppingCriteriaList,
	TextIteratorStreamer,
	)
	from vigogne.constants import ASSISTANT, USER
	from vigogne.inference.inference_utils import StopWordsCriteria
	from vigogne.preprocess import generate_inference_chat_prompt

	# from uuid import uuid4


	# Configure the root handler's line format; this module's own logger is then
	# opened up to DEBUG so the prompt/response traces emitted below are visible.
	logging.basicConfig(
	    format="%(asctime)s [%(levelname)s] [%(name)s] %(message)s",
	    datefmt="%Y-%m-%dT%H:%M:%SZ",
	)
	logger = logging.getLogger(__name__)
	logger.setLevel(logging.DEBUG)

	# Pick the inference device: CUDA when available, otherwise CPU, and switch to
	# Apple's MPS backend whenever torch reports it as available.
	device = "cuda" if torch.cuda.is_available() else "cpu"

	try:
	    if torch.backends.mps.is_available():
	        device = "mps"
	except AttributeError:
	    # Best-effort probe: a torch build that does not expose `torch.backends.mps`
	    # simply keeps the device chosen above. Only this expected failure is
	    # tolerated, so unrelated errors (or Ctrl-C) are no longer swallowed.
	    logger.debug("torch.backends.mps is not available in this torch build; keeping device `%s`", device)

	logger.info("Model will be loaded on device `%s`", device)


	# def log_conversation(conversation_id, history, messages, generate_kwargs):
	# logging_url = os.getenv("LOGGING_URL", None)
	# if logging_url is None:
	# return

	# timestamp = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")

	# data = {
	# "conversation_id": conversation_id,
	# "timestamp": timestamp,
	# "history": history,
	# "messages": messages,
	# "generate_kwargs": generate_kwargs,
	# }

	# try:
	# requests.post(logging_url, json=data)
	# except requests.exceptions.RequestException as e:
	# print(f"Error logging conversation: {e}")


	def user(message, history):
	    """Record a freshly submitted user message in the chat history.

	    Args:
	        message: Text taken from the message box.
	        history: Conversation so far, as a list of ``[user_text, bot_text]`` pairs.

	    Returns:
	        A 2-tuple: an empty string (the new content of the message box) and a
	        new history list ending with ``[message, ""]``, whose empty second slot
	        is where the reply gets written later. ``history`` itself is not mutated.
	    """
	    pending_turn = [message, ""]
	    updated_history = history + [pending_turn]
	    return "", updated_history


	# def get_uuid():
	# return str(uuid4())


	def _load_model(base_model_name_or_path: str, lora_model_name_or_path: str, load_8bit: bool):
	    """Load the base causal LM, attach the LoRA adapter and put the result in eval mode.

	    The loading arguments depend on the module-level ``device``: fp16 with an
	    automatic device map on CUDA, fp16 pinned to the device on MPS, and default
	    precision with ``low_cpu_mem_usage`` on CPU.
	    """
	    if device == "cuda":
	        model = AutoModelForCausalLM.from_pretrained(
	            base_model_name_or_path,
	            load_in_8bit=load_8bit,
	            torch_dtype=torch.float16,
	            device_map="auto",
	        )
	        model = PeftModel.from_pretrained(
	            model,
	            lora_model_name_or_path,
	            torch_dtype=torch.float16,
	        )
	    elif device == "mps":
	        model = AutoModelForCausalLM.from_pretrained(
	            base_model_name_or_path,
	            device_map={"": device},
	            torch_dtype=torch.float16,
	        )
	        model = PeftModel.from_pretrained(
	            model,
	            lora_model_name_or_path,
	            device_map={"": device},
	            torch_dtype=torch.float16,
	        )
	    else:
	        model = AutoModelForCausalLM.from_pretrained(
	            base_model_name_or_path,
	            device_map={"": device},
	            low_cpu_mem_usage=True,
	        )
	        model = PeftModel.from_pretrained(
	            model,
	            lora_model_name_or_path,
	            device_map={"": device},
	        )

	    if not load_8bit and device != "cpu":
	        model.half()  # seems to fix bugs for some users.

	    model.eval()
	    return model


	def main(
	    base_model_name_or_path: str = "huggyllama/llama-7b",
	    lora_model_name_or_path: str = "bofenghuang/vigogne-chat-7b",
	    load_8bit: bool = False,
	    server_name: Optional[str] = "0.0.0.0",
	    server_port: Optional[int] = None,
	    share: bool = False,
	):
	    """Load the model, build the Gradio chat UI and launch the web server.

	    Args:
	        base_model_name_or_path: Base model handed to ``from_pretrained``. When the
	            string contains "llama", ``LlamaTokenizer`` is used instead of
	            ``AutoTokenizer``.
	        lora_model_name_or_path: LoRA adapter applied on top of the base model.
	        load_8bit: Forwarded as ``load_in_8bit`` on CUDA; also disables the final
	            ``model.half()`` call.
	        server_name: Forwarded to ``demo.launch``.
	        server_port: Forwarded to ``demo.launch``.
	        share: Forwarded to ``demo.launch``.
	    """
	    tokenizer_class = LlamaTokenizer if "llama" in base_model_name_or_path else AutoTokenizer
	    tokenizer = tokenizer_class.from_pretrained(base_model_name_or_path, padding_side="right", use_fast=False)

	    model = _load_model(base_model_name_or_path, lora_model_name_or_path, load_8bit)

	    # Generation stops at a role marker, and any marker left at the end of a
	    # streamed chunk is stripped before display. The alternation is assembled
	    # outside the f-string: a backslash inside an f-string expression is a
	    # SyntaxError before Python 3.12, and an escaped pipe would match a literal
	    # "|" instead of separating the alternatives.
	    stop_words = [f"<|{ASSISTANT}|>", f"<|{USER}|>"]
	    stop_words_criteria = StopWordsCriteria(stop_words=stop_words, tokenizer=tokenizer)
	    stop_words_alternation = "|".join(re.escape(stop_word) for stop_word in stop_words)
	    pattern_trailing_stop_words = re.compile(rf"(?:{stop_words_alternation})\W*$")

	    def bot(history, max_new_tokens, temperature, top_p, top_k, repetition_penalty, conversation_id=None):
	        """Stream the assistant reply for the last turn of ``history``.

	        Yields ``history`` after every streamed chunk, with the reply accumulated
	        in ``history[-1][1]``. ``conversation_id`` is accepted but unused.

	        Raises:
	            gr.Error: If no prompt could be built (reported as the input being too long).
	        """
	        # Build the full prompt string from the conversation history.
	        messages = generate_inference_chat_prompt(history, tokenizer)
	        logger.info(messages)
	        if messages is None:
	            # An explicit raise survives `python -O` (asserts do not).
	            raise gr.Error("User input is too long!")

	        input_ids = tokenizer(messages, return_tensors="pt")["input_ids"].to(device)
	        streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
	        generate_kwargs = dict(
	            input_ids=input_ids,
	            generation_config=GenerationConfig(
	                temperature=temperature,
	                do_sample=temperature > 0.0,
	                top_p=top_p,
	                top_k=top_k,
	                repetition_penalty=repetition_penalty,
	                max_new_tokens=max_new_tokens,
	            ),
	            streamer=streamer,
	            stopping_criteria=StoppingCriteriaList([stop_words_criteria]),
	        )

	        # Generate on a worker thread so this generator can consume the streamer
	        # and push partial text to the UI while tokens are still being produced.
	        generation_thread = Thread(target=model.generate, kwargs=generate_kwargs)
	        generation_thread.start()

	        partial_text = ""
	        for new_text in streamer:
	            # Drop a stop word (and trailing non-word characters) ending the chunk.
	            new_text = pattern_trailing_stop_words.sub("", new_text)

	            partial_text += new_text
	            history[-1][1] = partial_text
	            yield history

	        logger.info("Response: %s", history[-1][1])

	    # Assembled from explicit lines so that the file's own indentation can never
	    # leak into the rendered markdown.
	    header_markdown = "\n".join(
	        [
	            "<h1><center>🦙 Vigogne Chat</center></h1>",
	            "",
	            "This demo is of [Vigogne-Chat-7B](https://huggingface.co/bofenghuang/vigogne-chat-7b). It's based on [LLaMA-7B](https://github.com/facebookresearch/llama) finetuned to conduct French 🇫🇷 dialogues between a user and an AI assistant.",
	            "",
	            "For more information, please visit the [Github repo](https://github.com/bofenghuang/vigogne) of the Vigogne project.",
	            "",
	        ]
	    )

	    with gr.Blocks(
	        theme=gr.themes.Soft(),
	        css=".disclaimer {font-variant-caps: all-small-caps;}",
	    ) as demo:
	        gr.Markdown(header_markdown)
	        chatbot = gr.Chatbot().style(height=500)
	        with gr.Row():
	            with gr.Column():
	                msg = gr.Textbox(
	                    label="Chat Message Box",
	                    placeholder="Chat Message Box",
	                    show_label=False,
	                ).style(container=False)
	            with gr.Column():
	                with gr.Row():
	                    submit = gr.Button("Submit")
	                    stop = gr.Button("Stop")
	                    clear = gr.Button("Clear")
	        with gr.Row():
	            with gr.Accordion("Advanced Options:", open=False):
	                with gr.Row():
	                    with gr.Column():
	                        with gr.Row():
	                            max_new_tokens = gr.Slider(
	                                label="Max New Tokens",
	                                value=512,
	                                minimum=0,
	                                maximum=1024,
	                                step=1,
	                                interactive=True,
	                                info="The Max number of new tokens to generate.",
	                            )
	                    with gr.Column():
	                        with gr.Row():
	                            temperature = gr.Slider(
	                                label="Temperature",
	                                value=0.1,
	                                minimum=0.0,
	                                maximum=1.0,
	                                step=0.1,
	                                interactive=True,
	                                info="Higher values produce more diverse outputs.",
	                            )
	                    with gr.Column():
	                        with gr.Row():
	                            top_p = gr.Slider(
	                                label="Top-p (nucleus sampling)",
	                                value=1.0,
	                                minimum=0.0,
	                                maximum=1,
	                                step=0.01,
	                                interactive=True,
	                                info=(
	                                    "Sample from the smallest possible set of tokens whose cumulative probability "
	                                    "exceeds top_p. Set to 1 to disable and sample from all tokens."
	                                ),
	                            )
	                    with gr.Column():
	                        with gr.Row():
	                            top_k = gr.Slider(
	                                label="Top-k",
	                                value=0,
	                                minimum=0.0,
	                                maximum=200,
	                                step=1,
	                                interactive=True,
	                                info="Sample from a shortlist of top-k tokens — 0 to disable and sample from all tokens.",
	                            )
	                    with gr.Column():
	                        with gr.Row():
	                            repetition_penalty = gr.Slider(
	                                label="Repetition Penalty",
	                                value=1.0,
	                                minimum=1.0,
	                                maximum=2.0,
	                                step=0.1,
	                                interactive=True,
	                                info="Penalize repetition — 1.0 to disable.",
	                            )
	        with gr.Row():
	            gr.Markdown(
	                "Disclaimer: Vigogne is still under development, and there are many limitations that have to be addressed. Please note that it is possible that the model generates harmful or biased content, incorrect information or generally unhelpful answers.",
	                elem_classes=["disclaimer"],
	            )
	        with gr.Row():
	            gr.Markdown(
	                "Acknowledgements: This demo is built on top of [MPT-7B-Chat](https://huggingface.co/spaces/mosaicml/mpt-7b-chat). Thanks for their contribution!",
	                elem_classes=["disclaimer"],
	            )

	        # Pressing Enter and clicking "Submit" share one pipeline: first record
	        # the user turn (unqueued), then stream the reply through the queue.
	        bot_inputs = [
	            chatbot,
	            max_new_tokens,
	            temperature,
	            top_p,
	            top_k,
	            repetition_penalty,
	        ]
	        submit_event = msg.submit(
	            fn=user,
	            inputs=[msg, chatbot],
	            outputs=[msg, chatbot],
	            queue=False,
	        ).then(
	            fn=bot,
	            inputs=bot_inputs,
	            outputs=chatbot,
	            queue=True,
	        )
	        submit_click_event = submit.click(
	            fn=user,
	            inputs=[msg, chatbot],
	            outputs=[msg, chatbot],
	            queue=False,
	        ).then(
	            fn=bot,
	            inputs=bot_inputs,
	            outputs=chatbot,
	            queue=True,
	        )
	        # "Stop" runs no function of its own; it only cancels in-flight replies.
	        stop.click(
	            fn=None,
	            inputs=None,
	            outputs=None,
	            cancels=[submit_event, submit_click_event],
	            queue=False,
	        )
	        # "Clear" resets the chatbot component to an empty conversation.
	        clear.click(lambda: None, None, chatbot, queue=False)

	    demo.queue(max_size=128, concurrency_count=2)
	    demo.launch(enable_queue=True, share=share, server_name=server_name, server_port=server_port)


	# Only start the demo when this file is executed as a script, so that importing
	# the module does not load the model or launch the server as a side effect.
	# NOTE(review): the base checkpoint passed here differs from the default declared
	# in main()'s signature ("huggyllama/llama-7b") — confirm which one is intended.
	if __name__ == "__main__":
	    main(base_model_name_or_path="decapoda-research/llama-7b-hf", lora_model_name_or_path="bofenghuang/vigogne-chat-7b")