Samarth991 committed on
Commit
2faf743
1 Parent(s): 19b1878

application to run llama-7b on Audio files

Files changed (4)
  1. app.py +177 -0
  2. llm_ops.py +21 -0
  3. requirements.txt +12 -0
  4. whisper_app.py +69 -0
app.py ADDED
@@ -0,0 +1,177 @@
+ import time
+ import gradio as gr
+ import logging
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.embeddings import SentenceTransformerEmbeddings
+ from langchain.vectorstores import FAISS
+ from langchain.chains import RetrievalQA
+ from langchain.prompts import PromptTemplate
+ from langchain.docstore.document import Document
+ from whisper_app import WHISPERModel
+ import llm_ops
+
+ FILE_EXT = ['wav','mp3']
+ MAX_NEW_TOKENS = 4096
+ DEFAULT_MAX_NEW_TOKENS = 1024
+ DEFAULT_TEMPERATURE = 0.1
+
+ def create_logger():
+     formatter = logging.Formatter('%(asctime)s:%(levelname)s:- %(message)s')
+     console_handler = logging.StreamHandler()
+     console_handler.setLevel(logging.INFO)
+     console_handler.setFormatter(formatter)
+
+     logger = logging.getLogger("APT_Realignment")
+     logger.setLevel(logging.INFO)
+
+     if not logger.hasHandlers():
+         logger.addHandler(console_handler)
+     logger.propagate = False
+     return logger
+
+
+ def create_prompt():
+     prompt_template = """Answer the questions regarding the content in the audio.
+     Use the following context to answer.
+     If you don't know the answer, just say I don't know.
+
+     {context}
+
+     Question: {question}
+     Answer :"""
+     prompt = PromptTemplate(
+         template=prompt_template, input_variables=["context", "question"]
+     )
+     return prompt
+
+
+ logger = create_logger()
+
+ def process_documents(documents, data_chunk=1500, chunk_overlap=100):
+     text_splitter = CharacterTextSplitter(chunk_size=data_chunk, chunk_overlap=chunk_overlap, separator='\n')
+     texts = text_splitter.split_documents(documents)
+     return texts
+
+ def audio_processor(wav_file, API_key, wav_model='small', llm='HuggingFace', temperature=0.1, max_tokens=4096):
+     device = 'cpu'
+     logger.info("Loading Whisper model || Model size: {}".format(wav_model))
+     whisper = WHISPERModel(model_name=wav_model, device=device)
+     text_info = whisper.speech_to_text(audio_path=wav_file)
+
+     metadata = {"source": f"{wav_file}", "duration": text_info['duration'], "language": text_info['language']}
+     document = [Document(page_content=text_info['text'], metadata=metadata)]
+     logger.info("Document: {}".format(document))
+     logger.info("Loading General Text Embeddings (GTE) model {}".format('thenlper/gte-large'))
+     embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-large', model_kwargs={"device": device})
+     texts = process_documents(documents=document)
+     global vector_db
+     vector_db = FAISS.from_documents(documents=texts, embedding=embedding_model)
+     global qa
+     if llm == 'HuggingFace':
+         chat = llm_ops.get_hugging_face_model(
+             model_id="meta-llama/Llama-2-7b",
+             API_key=API_key,
+             temperature=temperature,
+             max_tokens=max_tokens
+         )
+     else:
+         chat = llm_ops.get_openai_chat_model(API_key=API_key)
+
+     chain_type_kwargs = {"prompt": create_prompt()}
+
+     qa = RetrievalQA.from_chain_type(llm=chat,
+                                      chain_type='stuff',
+                                      retriever=vector_db.as_retriever(),
+                                      chain_type_kwargs=chain_type_kwargs,
+                                      return_source_documents=True
+                                      )
+     return "Audio Processing completed ..."
+
+ def infer(question, history):
+     result = qa({"query": question})
+     matching_docs_score = vector_db.similarity_search_with_score(question)
+     logger.info("Matching Score: {}".format(matching_docs_score))
+     return result["result"]
+
+ def bot(history):
+     response = infer(history[-1][0], history)
+     history[-1][1] = ""
+
+     for character in response:
+         history[-1][1] += character
+         time.sleep(0.05)
+         yield history
+
+ def add_text(history, text):
+     history = history + [(text, None)]
+     return history, ""
+
+
+ def loading_file():
+     return "Loading..."
+
+
+ css = """
+ #col-container {max-width: 2048px; margin-left: auto; margin-right: auto;}
+ """
+
+ title = """
+ <div style="text-align: center;max-width: 2048px;">
+     <h1>Chat with Audio Files</h1>
+     <p style="text-align: center;">Upload an audio file of any lecture/song/research talk/conference & ask the chatbot questions about it.
+     <i>The tool uses state-of-the-art models from HuggingFace/OpenAI, so make sure to add your key.</i>
+     </p>
+ </div>
+ """
+ with gr.Blocks(css=css) as demo:
+     with gr.Row():
+         with gr.Column(elem_id="col-container"):
+             gr.HTML(title)
+
+     with gr.Column():
+         with gr.Row():
+             LLM_option = gr.Dropdown(['HuggingFace','OpenAI'], label='Select HuggingFace/OpenAI')
+             API_key = gr.Textbox(label="Add API key", type="password", autofocus=True)
+             wav_model = gr.Dropdown(['small','medium','large'], label='Select Whisper model')
+
+         with gr.Group():
+             chatbot = gr.Chatbot(height=270)
+
+             with gr.Row():
+                 question = gr.Textbox(label="Type your question !", lines=1).style(full_width=True)
+
+             with gr.Row():
+                 submit_btn = gr.Button(value="Send message", variant="primary", scale=1)
+                 clean_chat_btn = gr.Button("Delete Chat")
+
+     with gr.Column():
+         with gr.Box():
+             audio_file = gr.File(label="Upload Audio File ", file_types=FILE_EXT, type="file")
+         with gr.Accordion(label='Advanced options', open=False):
+             max_new_tokens = gr.Slider(
+                 label='Max new tokens',
+                 minimum=1024,
+                 maximum=MAX_NEW_TOKENS,
+                 step=1,
+                 value=DEFAULT_MAX_NEW_TOKENS,
+             )
+             temperature = gr.Slider(
+                 label='Temperature',
+                 minimum=0.1,
+                 maximum=4.0,
+                 step=0.1,
+                 value=DEFAULT_TEMPERATURE,
+             )
+         with gr.Row():
+             langchain_status = gr.Textbox(label="Status", placeholder="", interactive=False)
+             load_audio = gr.Button("Upload Audio File").style(full_width=False)
+
+     if audio_file:
+         load_audio.click(loading_file, None, langchain_status, queue=False)
+         load_audio.click(audio_processor, inputs=[audio_file, API_key, wav_model, LLM_option, temperature, max_new_tokens], outputs=[langchain_status], queue=False)
+
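Note: as committed, app.py defines the chat helpers (add_text, bot, infer) and the chat widgets (question, submit_btn, clean_chat_btn) but never connects them, and the Blocks app is never launched. A minimal sketch of the missing wiring, assuming the same gradio 3.x event API used elsewhere in the file (this is not part of the commit; the first three statements would sit inside the `with gr.Blocks(...)` block, the launch call at module level):

    # append the user message, then stream the model answer into the chatbot
    question.submit(add_text, [chatbot, question], [chatbot, question], queue=False).then(
        bot, chatbot, chatbot)
    submit_btn.click(add_text, [chatbot, question], [chatbot, question], queue=False).then(
        bot, chatbot, chatbot)
    # clear the chat history
    clean_chat_btn.click(lambda: None, None, chatbot, queue=False)

# start the Gradio app (assumed entry point for a Space running this file)
demo.launch()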
llm_ops.py ADDED
@@ -0,0 +1,21 @@
+ import os
+
+ def get_openai_chat_model(API_key):
+     try:
+         from langchain.llms import OpenAI
+     except ImportError as err:
+         raise ImportError(f"{err}: unable to load OpenAI. Please install openai and set OPENAI_API_KEY") from err
+     os.environ["OPENAI_API_KEY"] = API_key
+     llm = OpenAI()
+     return llm
+
+ def get_hugging_face_model(model_id, API_key, temperature=0.1, max_tokens=4096):
+     try:
+         from langchain import HuggingFaceHub
+     except ImportError as err:
+         raise ImportError(f"{err}: unable to load HuggingFaceHub. Please install langchain/huggingface_hub and provide a Hugging Face API token") from err
+     chat_llm = HuggingFaceHub(huggingfacehub_api_token=API_key,
+                               repo_id=model_id,
+                               model_kwargs={"temperature": temperature, "max_new_tokens": max_tokens})
+     return chat_llm
+
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ openai
+ tiktoken
+ chromadb
+ langchain
+ unstructured
+ unstructured[local-inference]
+ transformers
+ torch
+ faiss-cpu
+ sentence-transformers
+ youtube-transcript-api
+ openai-whisper
whisper_app.py ADDED
@@ -0,0 +1,69 @@
+ import os
+ import logging
+ import torch as th
+ import whisper
+ from whisper.audio import SAMPLE_RATE
+ from tenacity import retry, wait_random
+ import openai
+ import requests
+ import time
+ # os.environ['OPENAI_API_KEY'] = "sk-<API KEY>"
+
+ class WHISPERModel:
+     def __init__(self, model_name='small', device='cuda', openai_flag=False):
+         self.logger = logging.getLogger(__name__)
+         self.device = device
+         self.openai_flag = openai_flag
+         self.model = whisper.load_model(model_name, device=self.device)
+
+     def get_info(self, audio_data, conv_duration=30):
+         # detect the language from the first `conv_duration` seconds of audio
+         clip_audio = whisper.pad_or_trim(audio_data, length=SAMPLE_RATE * conv_duration)
+         result = self.model.transcribe(clip_audio)
+         return result['language']
+
+     def speech_to_text(self, audio_path):
+         self.logger.info("Reading url {}".format(audio_path))
+         text_data = dict()
+         audio_duration = 0
+         conv_language = ""
+         r = requests.get(audio_path)
+         if r.status_code == 200:
+             try:
+                 audio = whisper.load_audio(audio_path)
+                 conv_language = self.get_info(audio)
+                 if conv_language != 'en':
+                     res = self.model.transcribe(audio, task='translate')
+                     if self.openai_flag:
+                         res['text'] = self.translate_text(res['text'], original_lang=conv_language, convert_to='English')
+                 else:
+                     res = self.model.transcribe(audio)
+                 audio_duration = audio.shape[0] / SAMPLE_RATE
+                 text_data['text'] = res['text']
+                 text_data['duration'] = audio_duration
+                 text_data['language'] = conv_language
+             except IOError as err:
+                 raise IOError(f"Issue in loading audio {audio_path}") from err
+         else:
+             raise ValueError("Unable to reach URL {}".format(audio_path))
+         return text_data
+
+
+     @retry(wait=wait_random(min=5, max=10))
+     def translate_text(self, text, original_lang='ar', convert_to='English'):
+         prompt = f'Translate the following {original_lang} text to {convert_to}:\n\n{original_lang}: ' + text + f'\n{convert_to}:'
+         # Generate response using the OpenAI completion endpoint
+         response = openai.Completion.create(
+             engine='text-davinci-003',
+             prompt=prompt,
+             max_tokens=100,
+             n=1,
+             stop=None,
+             temperature=0.7
+         )
+         # Extract the translated English text from the response
+         translation = response.choices[0].text.strip()
+         return translation
+
+ if __name__ == '__main__':
+     url = "https://prypto-api.aswat.co/surveillance/recordings/5f53c28b-3504-4b8b-9db5-0c8b69a96233.mp3"
+     audio2text = WHISPERModel()
+     text = audio2text.speech_to_text(url)