Spaces:

TeamTonic
/

Qwen-Audio-Chat

Paused

App Files Files Community

Qwen-Audio-Chat / app.py

Tonic

Update app.py

3948c4b 12 months ago

raw

history blame

6.53 kB

	import gradio as gr
	from transformers import AutoModelForCausalLM, AutoTokenizer
	import torch
	import os
	import copy
	import re
	import secrets
	from pathlib import Path
	from pydub import AudioSegment

	# Initialize the model and tokenizer once at import time; every request
	# handler below uses these two module-level objects.
	# Fixed seed for torch's random number generator (presumably so sampling is
	# reproducible across restarts -- confirm).
	torch.manual_seed(420)
	# trust_remote_code=True lets transformers execute the Python code shipped
	# inside the "Qwen/Qwen-Audio-Chat" model repository.
	tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-Audio-Chat", trust_remote_code=True)
	# device_map="cuda" places the weights on the GPU; .eval() switches the
	# module to inference mode.
	model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-Audio-Chat", device_map="cuda", trust_remote_code=True).eval()

	def _parse_text(text):
	lines = text.split("\n")
	lines = [line for line in lines if line != ""]
	count = 0
	for i, line in enumerate(lines):
	if "```" in line:
	count += 1
	items = line.split("`")
	if count % 2 == 1:
	lines[i] = f'<pre><code class="language-{items[-1]}">'
	else:
	lines[i] = f"<br></code></pre>"
	else:
	if i > 0:
	if count % 2 == 1:
	line = line.replace("`", r"\`")
	line = line.replace("<", "<")
	line = line.replace(">", ">")
	line = line.replace(" ", " ")
	line = line.replace("*", "&ast;")
	line = line.replace("_", "&lowbar;")
	line = line.replace("-", "-")
	line = line.replace(".", ".")
	line = line.replace("!", "!")
	line = line.replace("(", "(")
	line = line.replace(")", ")")
	line = line.replace("$", "$")
	lines[i] = "<br>" + line
	text = "".join(lines)
	return text

	def predict(_chatbot, task_history, user_input=None):
	    """Answer the latest turn in ``task_history`` with the Qwen-Audio model.

	    Args:
	        _chatbot: Rows shown in the chat window, as ``(user, bot)`` pairs.
	            The last row is overwritten with the answered turn; an empty or
	            missing log is replaced by a new one-row list.
	        task_history: Raw conversation as a list of 2-tuples
	            ``(query, answer)``.  A query is either text or a sequence whose
	            first element is an audio file path.  The answer of the last
	            entry is filled in with the model's raw response.
	        user_input: Optional text to display as the user side of the
	            answered row; when empty, the last query in ``task_history`` is
	            displayed instead.  Optional so two-argument callers keep working.

	    Returns:
	        ``(reply_text, chatbot)``.  ``reply_text`` is ``""`` when nothing was
	        generated (malformed or empty history, or no text query to answer).
	        Every exit returns this same two-element shape.
	    """
	    import tempfile  # local import: only needed to locate the clip directory

	    print("Predict - Start: task_history =", task_history)
	    well_formed = isinstance(task_history, list) and all(
	        isinstance(item, tuple) and len(item) == 2 for item in task_history
	    )
	    if not well_formed:
	        print("Error: task_history should be a list of tuples of length 2.")
	        return "", _chatbot
	    if not task_history:
	        return "", _chatbot

	    query = user_input if user_input else task_history[-1][0]
	    # Audio turns store a (path,) tuple; only text can go through _parse_text.
	    shown_query = _parse_text(query) if isinstance(query, str) else query
	    print("User: " + str(shown_query))

	    # Fold the history into (prompt, answer) turns.  Audio entries are not
	    # turns of their own: they are numbered and prepended to the next text
	    # query as "Audio N: <audio>path</audio>" lines.
	    turns = []
	    audio_prefix = ""
	    audio_count = 0
	    last_audio = None
	    for question, answer in task_history:
	        if isinstance(question, (tuple, list)):
	            audio_count += 1
	            last_audio = question[0]
	            audio_prefix += f"Audio {audio_count}: <audio>{question[0]}</audio>\n"
	        else:
	            turns.append((audio_prefix + question, answer))
	            audio_prefix = ""
	    if not turns:
	        return "", _chatbot

	    *history, (message, _) = turns
	    response, _ = model.chat(tokenizer, message, history=history)

	    # Timestamps appear in the response as <|12.34|> markers; the capture
	    # group yields the number directly.
	    stamps = [float(s) for s in re.findall(r"<\|(\d{1,2}\.\d+)\|>", response)]

	    reply = response
	    clip_path = None
	    if stamps and len(stamps) % 2 == 0 and last_audio:
	        # Load the most recent audio file.
	        audio_format = Path(last_audio).suffix.lstrip(".")
	        recording = AudioSegment.from_file(last_audio, format=audio_format)
	        reply = response.replace("<|", "").replace("|>", "")
	        # Write clips below Gradio's temp directory, in a fresh random
	        # sub-directory per response.
	        clip_root = os.environ.get("GRADIO_TEMP_DIR") or str(
	            Path(tempfile.gettempdir()) / "gradio"
	        )
	        clip_dir = Path(clip_root) / secrets.token_hex(20)
	        clip_dir.mkdir(exist_ok=True, parents=True)
	        # Cut one clip per (start, end) pair; positions are in milliseconds.
	        for start, end in zip(stamps[0::2], stamps[1::2]):
	            clip_path = clip_dir / f"tmp{secrets.token_hex(5)}.{audio_format}"
	            # Save the clip.
	            recording[start * 1000: end * 1000].export(clip_path, format=audio_format)

	    answered_row = (shown_query, reply)
	    if _chatbot:
	        _chatbot[-1] = answered_row
	    else:
	        _chatbot = [answered_row]
	    if clip_path is not None:
	        # NOTE: as before, only the last exported clip is attached to the chat.
	        _chatbot.append((None, (str(clip_path),)))

	    # Record the answer so later turns carry it as history and so that
	    # regenerate() can tell the turn has been answered.
	    task_history[-1] = (task_history[-1][0], response)

	    print("Predict - End: task_history =", task_history)
	    return reply, _chatbot


	def regenerate(_chatbot, task_history):
	    """Drop the latest answer and ask the model to answer that turn again.

	    Args:
	        _chatbot: Rows shown in the chat window, as ``(user, bot)`` pairs.
	        task_history: Raw conversation as a list of ``(query, answer)``
	            tuples.

	    Returns:
	        ``_chatbot`` unchanged when there is nothing to regenerate (empty
	        history, empty chat log, or the last turn has no answer yet);
	        otherwise whatever ``predict`` returns for the reset turn.
	    """
	    print("Regenerate - Start: task_history =", task_history)
	    # Check both containers before mutating either, so a bail-out leaves the
	    # state untouched (popping from an empty chat log used to raise).
	    if not task_history or not _chatbot:
	        return _chatbot
	    latest = task_history[-1]
	    if latest[1] is None:
	        return _chatbot

	    task_history[-1] = (latest[0], None)
	    removed_row = _chatbot.pop()
	    if removed_row[0] is None:
	        # A row without a user side is bot-only output; the answered row is
	        # the one before it, if any.
	        if _chatbot:
	            _chatbot[-1] = (_chatbot[-1][0], None)
	    else:
	        _chatbot.append((removed_row[0], None))
	    print("Regenerate - End: task_history =", task_history)
	    # predict takes three parameters; pass None so the query is taken from
	    # task_history.
	    return predict(_chatbot, task_history, None)

	def add_text(history, task_history, text):
	    """Append a user text message as a new unanswered turn.

	    Args:
	        history: Rows shown in the chat window; receives the rendered
	            (``_parse_text``) form of the message.
	        task_history: Raw conversation; receives the message verbatim.
	        text: The message typed by the user.

	    Returns:
	        ``(history, task_history)`` after the append.  An argument that is
	        not a list (for example an unset state holding ``None``) is replaced
	        by a fresh list.
	    """
	    print("Add Text - Before: task_history =", task_history)
	    # Both containers get the same guard; previously only task_history did,
	    # so a None chat log crashed on append.
	    if not isinstance(history, list):
	        history = []
	    if not isinstance(task_history, list):
	        task_history = []
	    history.append((_parse_text(text), None))
	    task_history.append((text, None))
	    print("Add Text - After: task_history =", task_history)
	    return history, task_history

	def add_file(history, task_history, file):
	    """Append an uploaded audio file as a new unanswered turn.

	    Args:
	        history: Rows shown in the chat window.
	        task_history: Raw conversation.
	        file: The upload: either an object exposing the path as ``.name`` or
	            the path itself.  ``None`` means nothing was uploaded.

	    Returns:
	        ``(history, task_history)``; each gains a ``((path,), None)`` entry
	        unless ``file`` is ``None``.  An argument that is not a list is
	        replaced by a fresh list.
	    """
	    print("Add File - Before: task_history =", task_history)
	    if not isinstance(history, list):
	        history = []
	    if not isinstance(task_history, list):
	        task_history = []
	    if file is None:
	        return history, task_history
	    path = getattr(file, "name", file)
	    history.append(((path,), None))
	    task_history.append(((path,), None))
	    print("Add File - After: task_history =", task_history)
	    return history, task_history

	def add_mic(history, task_history, file):
	    """Append a microphone recording as a new unanswered turn.

	    The recording is renamed on disk so that its path ends in ``.wav``; a
	    path that already has that extension is left as it is.

	    Args:
	        history: Rows shown in the chat window.
	        task_history: Raw conversation.
	        file: Path of the recorded file, or ``None`` when nothing was
	            recorded.

	    Returns:
	        ``(history, task_history)``; each gains a ``((wav_path,), None)``
	        entry unless ``file`` is ``None``.  An argument that is not a list
	        is replaced by a fresh list.
	    """
	    print("Add Mic - Before: task_history =", task_history)
	    if not isinstance(history, list):
	        history = []
	    if not isinstance(task_history, list):
	        task_history = []
	    if file is None:
	        return history, task_history
	    if file.lower().endswith(".wav"):
	        # Already carries the extension; renaming would yield ".wav.wav".
	        wav_path = file
	    else:
	        wav_path = file + ".wav"
	        os.rename(file, wav_path)
	    history.append(((wav_path,), None))
	    task_history.append(((wav_path,), None))
	    print("Add Mic - After: task_history =", task_history)
	    return history, task_history

	def reset_user_input():
	    """Return a Gradio update that sets a component's value to the empty string."""
	    cleared = gr.update(value="")
	    return cleared

	def reset_state(task_history):
	    """Forget the conversation.

	    Args:
	        task_history: Raw conversation list; emptied in place so the caller's
	            list is actually cleared (rebinding the parameter, as before,
	            left the caller's history intact).  Non-list values are ignored.

	    Returns:
	        A new empty list.
	    """
	    print("Reset State - Before: task_history =", task_history)
	    if isinstance(task_history, list):
	        task_history.clear()
	    print("Reset State - After: task_history =", task_history)
	    return []

	# Build the web UI around predict() and start serving it.
	# NOTE(review): gr.Interface passes the input components to `fn`
	# positionally, in the order listed.  As written, predict's parameters
	# (_chatbot, task_history, user_input) therefore receive the audio value,
	# the text query and the state object respectively -- this does not match
	# what predict expects (a chat log, a list of 2-tuples, optional text), so
	# its task_history validation will reject the text query.  Confirm the
	# intended wiring; add_text / add_file / add_mic / regenerate / reset_state
	# are not connected to any component here.
	# NOTE(review): two outputs are declared, but predict's early exits return
	# a single value -- verify against predict.
	iface = gr.Interface(
	fn=predict,
	inputs=[
	gr.Audio(label="Audio Input"),
	gr.Textbox(label="Text Query"),
	gr.State()
	],
	outputs=[
	"text",
	gr.State()
	],
	title="Audio-Text Interaction Model",
	description="This model can process an audio input along with a text query and provide a response.",
	theme="default",
	allow_flagging="never"
	)

	# Runs at import time: starts the Gradio server.
	iface.launch()