import logging
import os
import time
from typing import Literal, Optional

import gradio as gr
import torch
from huggingface_hub import HfApi
from pyannote.audio import Pipeline
from pydantic import BaseModel
from pydantic_settings import BaseSettings
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, pipeline

from diarization_utils import diarize

# Module-level state shared between the Gradio callbacks:
# `variable` holds the uploaded file paths, `speech` holds the latest transcript.
variable = []
speech = ""

logger = logging.getLogger(__name__)
# Inlined configuration (originally: from config import model_settings, InferenceConfig).
class ModelSettings(BaseSettings):
    asr_model: str
    assistant_model: Optional[str]
    diarization_model: Optional[str]
    hf_token: Optional[str]


class InferenceConfig(BaseModel):
    task: Literal["transcribe", "translate"] = "transcribe"
    batch_size: int = 24
    assisted: bool = False
    chunk_length_s: int = 30
    sampling_rate: int = 16000
    language: Optional[str] = None
    num_speakers: Optional[int] = None
    min_speakers: Optional[int] = None
    max_speakers: Optional[int] = None
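# Illustrative only (not executed): the defaults above can be overridden per call,
# e.g. a translation request with a known speaker count:
#
#     config = InferenceConfig(task="translate", language="en", num_speakers=2)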
# Alternative NeMo-based diarization utilities (unused):
# from nemo.collections.asr.parts.utils.diarization_utils import OfflineDiarWithASR
# from nemo.collections.asr.parts.utils.decoder_timestamps_utils import ASRDecoderTimeStamps

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
logger.info(f"Using device: {device.type}")
torch_dtype = torch.float32 if device.type == "cpu" else torch.float16
# Summarization / assessment model: ChatGLM3 with a 32k-token context window.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b-32k", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm3-6b-32k", trust_remote_code=True, device_map="auto")

# Alternative long-context model (unused):
# base_model = "lyogavin/Anima-7B-100K"
# tokenizer = AutoTokenizer.from_pretrained(base_model)
# model = AutoModelForCausalLM.from_pretrained(
#     base_model,
#     bnb_4bit_compute_dtype=torch.float16,
#     trust_remote_code=True,
#     device_map="auto",
#     load_in_4bit=True,
# )
# model.eval()
# Distil-Whisper assistant model for speculative (assisted) decoding with Whisper large-v3.
assistant_model = AutoModelForCausalLM.from_pretrained(
    "distil-whisper/distil-large-v3",
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
assistant_model.to(device)

asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    torch_dtype=torch_dtype,
    device=device,
)

# Validate the HF token before downloading the gated pyannote pipeline.
HfApi().whoami(os.getenv("HF_TOKEN"))

diarization_pipeline = Pipeline.from_pretrained(
    checkpoint_path="pyannote/speaker-diarization-3.1",
    use_auth_token=os.getenv("HF_TOKEN"),
)
diarization_pipeline.to(device)
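# Illustrative only (not executed): the pyannote pipeline can also be run directly
# on a local WAV file, e.g.
#
#     annotation = diarization_pipeline("current_out.wav", num_speakers=2)
#     for turn, _, spk in annotation.itertracks(yield_label=True):
#         print(f"{spk}: {turn.start:.1f}s - {turn.end:.1f}s")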
def upload_file(files):
    """Store the uploaded file paths in the module-level `variable` list."""
    global variable
    file_paths = [file.name for file in files]
    variable = file_paths
    return file_paths
def audio_function():
    """Convert the uploaded file to WAV, then run ASR and speaker diarization."""
    time_1 = time.time()
    paths = variable
    # Join the stored paths into a single input path (assumes a single uploaded file).
    input_path = "".join(paths)

    # Convert the upload (audio or video) to WAV so both pipelines can consume it.
    # Optionally force 16 kHz mono PCM with: -acodec pcm_s16le -ar 16000 -ac 1
    print("before processing ffmpeg!")
    os.system('ffmpeg -i "{}" current_out.wav -y'.format(input_path))
    print("after ffmpeg")

    parameters = InferenceConfig()
    generate_kwargs = {
        "task": parameters.task,
        "language": parameters.language,
        "assistant_model": assistant_model if parameters.assisted else None,
    }

    with open("current_out.wav", "rb") as f:
        file = f.read()

    asr_outputs = asr_pipeline(
        file,
        chunk_length_s=parameters.chunk_length_s,
        batch_size=parameters.batch_size,
        generate_kwargs=generate_kwargs,
        return_timestamps=True,
    )
    transcript = diarize(diarization_pipeline, file, parameters, asr_outputs)

    global speech
    speech = transcript
    return transcript, asr_outputs["chunks"], asr_outputs["text"]
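# Illustrative only: with return_timestamps=True the ASR pipeline output is roughly
#
#     {"text": "full transcript ...",
#      "chunks": [{"timestamp": (0.0, 5.2), "text": "first segment ..."}, ...]}
#
# and diarize() combines these timestamped chunks with the pyannote speaker turns.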
def audio_function2():
    """Send the latest transcript to ChatGLM3 and return its checklist assessment."""
    str2 = speech
    time_3 = time.time()
    # prompt = "{} generate medical subjective objective assessment plan (SOAP) notes?".format(str2)
    prompt = """ {} "Did the technician introduce themselves at the start of the video?"
    "Did the technician mention their level of experience during the video?"
    "Did the technician use the customer's name during the introduction?"
    "Did the technician mention the name of the Customer Advisor managing the booking?"
    "Did the technician provide a personal recommendation statement in the video?"
    "Did the technician mention service plans available to the customer?"
    "Did the technician mention genuine Volkswagen parts during the video?"
    "Did the technician mention the national parts and labor warranty?"
    "Did the technician mention the 7-day price promise during the video?"
    "Did the technician thank the customer for choosing Parkway Volkswagen?"
    "Did the technician provide a clear NANO statement at the end of the video?"
    "Does the video show the vehicle staged on a raised ramp?"
    "Does the video show the area around the vehicle clean and organized?"
    "Does the video show the vehicle’s bonnet open and upright?"
    "Does the technician wear gloves during the video?"
    "Does the video show protective items (e.g., seat covers, mats) being used on the vehicle?"
    "Does the video show suitable props like a pointer or tire depth gauge being used?"
    "Does the video show the technician starting at the nearest point of reference on the vehicle?"
    "Does the video demonstrate the use of the Augmented Reality (AR) function?"
    "Did the technician verbally explain the condition of at least two items?" / "Does the video show evidence of at least two items (e.g., tires, brakes) being inspected?"
    "Did the technician explain the percentage wear of tire treads or brake pads?" / "Does the video show measurement of tire treads or brake pads?"
    "Does the video show the technician removing a wheel to demonstrate brake condition clearly?"
    "Did the technician provide additional context regarding brake or tire wear?" / "Does the video visually demonstrate brake or tire wear with context?"
    "Did the technician explain the consequences of any identified repair areas?" / "Does the video show repair areas or consequences visually?"
    "Did the technician verbally compare a new part to a worn part?" / "Does the video show a side-by-side comparison of a new part and a worn part?"
    "Does the video include or reference supporting documents (e.g., photographs of identified items)?" """.format(str2)

    response, history = model.chat(tokenizer, prompt, history=[])
    print(response)
    time_4 = time.time()

    # Alternative generation paths (unused):
    # inputs = tokenizer(prompt, return_tensors="pt")
    # inputs["input_ids"] = inputs["input_ids"].cuda()
    # inputs["attention_mask"] = inputs["attention_mask"].cuda()
    # generate_ids = model.generate(**inputs, max_new_tokens=4096,
    #                               only_last_logit=True,  # to save memory
    #                               use_cache=False,       # when running into OOM, enabling this can save memory
    #                               xentropy=True)
    # output = tokenizer.batch_decode(generate_ids,
    #                                 skip_special_tokens=True,
    #                                 clean_up_tokenization_spaces=False)
    #
    # tokenizer = AutoTokenizer.from_pretrained("togethercomputer/LLaMA-2-7B-32K")
    # model = AutoModelForCausalLM.from_pretrained("togethercomputer/LLaMA-2-7B-32K", trust_remote_code=True,
    #                                              torch_dtype=torch.float16, device_map="auto",
    #                                              bnb_4bit_compute_dtype=torch.float16, load_in_4bit=True)
    # input_context = "summarize the following {}".format(str2)
    # input_ids = tokenizer.encode(input_context, return_tensors="pt").cuda()
    # output = model.generate(input_ids, max_new_tokens=512, temperature=0.7)
    # output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    # print(output_text)
    # return output

    return response, str(int(time_4 - time_3)) + " seconds"
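# Illustrative only: ChatGLM3's chat API also returns the running history, so a
# follow-up turn inside audio_function2 could reuse it, e.g.
#
#     follow_up, history = model.chat(tokenizer, "List only the failed checks.", history=history)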
with gr.Blocks() as demo:
    file_output = gr.File()
    upload_button = gr.UploadButton(
        "Click to Upload a File", file_types=["audio", "video"], file_count="multiple"
    )
    upload_button.upload(upload_file, upload_button, file_output)

    gr.Markdown("## Click Process Audio to display the text from the audio file")
    process_button = gr.Button("Process Audio")
    diarization_text = gr.Textbox(label="Speech Diarization")
    chunks_text = gr.Textbox(label="Speech Chunks")
    asr_text = gr.Textbox(label="ASR Text")
    process_button.click(audio_function, outputs=[diarization_text, chunks_text, asr_text])

    gr.Markdown("## Click Summarize to display the call summary")
    summarize_button = gr.Button("Summarize")
    notes_text = gr.Textbox(label="Sales Call Notes")
    time_text = gr.Textbox(label="Time Taken")
    summarize_button.click(audio_function2, outputs=[notes_text, time_text])

demo.launch()
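# Note: on Spaces, long transcriptions may exceed the default request timeout; if so,
# Gradio's request queue can be enabled with `demo.queue().launch()` instead.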