"""Streamlit app: record audio, transcribe it, and extract entity labels.

Audio is captured with ``st_audiorec``, transcribed with the
``openai/whisper-large-v3`` speech-recognition pipeline, and the resulting
text is passed to the ``urchade/gliner_multi`` GLiNER model together with
``entity_labels`` from ``resources``.
"""
import streamlit as st
import torch
from gliner import GLiNER
from st_audiorec import st_audiorec
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
#from datasets import load_dataset

from resources import Lead_Labels, entity_labels, set_start, audit_elapsedtime

# Lazily initialised models and a call counter, shared through module globals.
# NOTE(review): Streamlit is documented to re-execute the script on every
# interaction, so these globals may be reset on each rerun and the models
# reloaded; `st.cache_resource` is the documented way to keep them alive --
# confirm and consider adopting it.
rec = None
ner = None
iteration = 0


def main() -> None:
    """Load the models if needed, run a sample extraction, and render the UI.

    The transcription pipeline and the NER model are stored in the module
    globals ``rec`` and ``ner`` and are only created while those are ``None``.
    A hard-coded sample sentence is run through the NER model on every call,
    then the audio recorder widget is rendered; when a recording is available
    it is played back, transcribed, and its entities are extracted.
    """
    # Without this declaration the assignments below make these names local
    # to `main`, and the very first read raises UnboundLocalError.
    global rec, ner, iteration

    print(f"Main iteration {iteration}")
    iteration += 1

    if rec is None:
        print("rec is None")
        rec = init_model_trans()
    if ner is None:
        print("ner is None")
        ner = init_model_ner()  #async

    labels = entity_labels

    text = "I have a proposal from cgd where they want one outsystems junior developers and one senior for an estimate of three hundred euros a day, for six months."
    print(f"get entities from sample text: {text}")
    get_entity_labels(model=ner, text=text, labels=labels)

    print("Render UI")
    wav_audio_data = st_audiorec()

    if wav_audio_data is not None and rec is not None:
        print("Loading data...")
        st.audio(wav_audio_data, format='audio/wav')
        text = transcribe(wav_audio_data, rec)
        if text is not None:
            get_entity_labels(labels=labels, model=ner, text=text)


def init_model_trans():
    """Build and return the Whisper automatic-speech-recognition pipeline.

    Uses ``cuda:0`` with ``float16`` when CUDA is available, otherwise the
    CPU with ``float32``. The elapsed time is reported through the
    ``set_start`` / ``audit_elapsedtime`` helpers from ``resources``.

    Returns:
        The ``transformers`` pipeline object for
        ``"automatic-speech-recognition"``.
    """
    print("Initiating transcription model...")
    start = set_start()

    cuda_available = torch.cuda.is_available()
    device = "cuda:0" if cuda_available else "cpu"
    torch_dtype = torch.float16 if cuda_available else torch.float32

    model_id = "openai/whisper-large-v3"

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=16,
        return_timestamps=True,
        torch_dtype=torch_dtype,
        device=device,
    )

    print(f'Init model successful: {model}')
    audit_elapsedtime(function="Initiating transcription model", start=start)
    return pipe


def init_model_ner():
    """Load and return the ``urchade/gliner_multi`` GLiNER model.

    The elapsed time is reported through the ``set_start`` /
    ``audit_elapsedtime`` helpers from ``resources``.
    """
    print("Initiating NER model...")
    start = set_start()
    model = GLiNER.from_pretrained("urchade/gliner_multi")
    audit_elapsedtime(function="Initiating NER model", start=start)
    return model


def transcribe(audio_sample: bytes, pipe) -> str:
    """Transcribe ``audio_sample`` with ``pipe`` and return the text.

    The raw pipeline result is printed, and the transcribed text is also
    written to the Streamlit page.

    Args:
        audio_sample: Recorded audio, passed to ``pipe`` unchanged.
        pipe: Callable speech-recognition pipeline; its result must support
            ``result["text"]``.

    Returns:
        The ``"text"`` entry of the pipeline result.
    """
    start = set_start()
    result = pipe(audio_sample)
    audit_elapsedtime(function="Transcription", start=start)

    print(result)
    transcription = result["text"]
    st.write('trancription: ', transcription)
    return transcription


def get_entity_labels(model: GLiNER, text: str, labels: list) -> None:
    """Extract entities from ``text`` and show them on stdout and in the page.

    Args:
        model: GLiNER model whose ``predict_entities`` is called.
        text: Text to analyse.
        labels: Entity labels passed to ``predict_entities``.
    """
    # TODO: return a Lead_Labels instance once the mapping from entities
    # is defined (the original code left this as a commented-out intent).
    start = set_start()
    entities = model.predict_entities(text, labels)
    audit_elapsedtime(function="Retreiving entity labels from text", start=start)

    for entity in entities:
        print(entity["text"], "=>", entity["label"])

    st.write('Entities: ', entities)


if __name__ == "__main__":
    print("IN __name__")
    main()