import io
import os

import requests
import streamlit as st
from audiorecorder import audiorecorder
from dotenv import load_dotenv
from gtts import gTTS
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langgraph.prebuilt import create_react_agent
from transformers import pipeline

st.set_page_config(page_title="Urdu Virtual Assistant", page_icon="🤖")  # set the page title and icon

# Load environment variables (if any)
load_dotenv()
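# The following environment variables are expected (loaded above from .env):
#   GROQ_API_KEY        - API key for the ChatGroq LLM
#   RAPIDAPI_LANG_TRANS - RapidAPI key for the Microsoft Translator endpoint
#   TAVILY_API_KEY      - read implicitly by TavilySearchResults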

user_id = "1"  # example user id

llm = ChatGroq(
    model="llama-3.1-70b-versatile",
    temperature=0.3,
    max_tokens=None,
    timeout=None,
    max_retries=5,
    groq_api_key=os.getenv("GROQ_API_KEY")
)
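# Groq serves the Llama 3.1 70B model; max_retries=5 retries transient API
# failures, and the low temperature (0.3) favors consistent, focused answers.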

search = TavilySearchResults(
    max_results=2,
)
tools = [search]

agent_executor = create_react_agent(llm, tools)
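# The ReAct agent loops between the LLM and the tool list: the model decides
# per query whether to answer directly or to call Tavily search first.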

# Initialize the wav2vec2 model for Urdu speech-to-text
pipe = pipeline("automatic-speech-recognition", model="kingabzpro/wav2vec2-large-xls-r-300m-Urdu")
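# The checkpoint is downloaded from the Hugging Face Hub on first run and
# cached locally for subsequent runs.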

def translate(target, text):
    '''
    Translates the given text into the target language.

    Parameters:
        target (string): 2-character code of the target language.
        text (string): Text to be translated.

    Returns:
        res (string): Translated text.
    '''
    url = "https://microsoft-translator-text.p.rapidapi.com/translate"

    querystring = {"api-version": "3.0", "profanityAction": "NoAction", "textType": "plain", "to": target}

    payload = [{"Text": text}]
    headers = {
        "x-rapidapi-key": os.getenv("RAPIDAPI_LANG_TRANS"),
        "x-rapidapi-host": "microsoft-translator-text.p.rapidapi.com",
        "Content-Type": "application/json"
    }

    response = requests.post(url, json=payload, headers=headers, params=querystring)
    response.raise_for_status()  # surface HTTP errors instead of failing on the JSON below
    res = response.json()
    return res[0]["translations"][0]["text"]
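# Hypothetical usage: translate("en", "آپ کیسے ہیں؟") should return roughly
# "How are you?"; the exact wording depends on the translation service.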


def infer(user_input: str):
    '''
        Returns the translated response from the LLM in response to a user query.

        Parameters:
            user_input (string): User query.

        Returns:
            res (string): Translated response from the LLM, in Urdu.
    '''

    user_input = translate("en", user_input)  # translate the user query to English

    prompt = ChatPromptTemplate.from_messages(  # define a prompt
        [
            (
                "system",
                "You are a compassionate and friendly AI virtual assistant. You will provide helpful answers to user queries using the provided tool to ensure the accuracy and relevance of your responses."
            ),
            ("human", "{user_input}")
        ]
    )

    # Format the prompt into a message list; the LangGraph agent expects its
    # input under the "messages" key rather than a raw prompt value.
    messages = prompt.invoke({"user_input": user_input}).to_messages()

    response = agent_executor.invoke({"messages": messages})  # run the ReAct agent

    res = translate("ur", response["messages"][-1].content)  # translate the final answer back to Urdu
    return res


def text_to_speech(text, lang='ur'):
    '''
        Converts text to speech using gTTS.

        Parameters:
            text (string): Text to be converted to speech.
            lang (string): Language for the speech synthesis. Default is 'ur' (Urdu).
        Returns:
            response_audio_io (BytesIO): BytesIO object containing the MP3 audio data.
    '''
    tts = gTTS(text, lang=lang)
    response_audio_io = io.BytesIO()
    tts.write_to_fp(response_audio_io)  # gTTS writes MP3-encoded audio to the buffer
    response_audio_io.seek(0)  # rewind so the full stream can be read later
    return response_audio_io


col1, col2 = st.columns([1, 5])  # Adjust the ratio to control the logo and title sizes

# Display the logo in the first column
with col1:
    st.image("bolo_logo-removebg-preview.png", width=100)  # Adjust the width as needed

# Display the title in the second column
with col2:
    st.title("Urdu Virtual Assistant") # set the main title of the application
st.write("This application is a comprehensive speech-to-speech model designed to understand and respond in Urdu. It not only handles natural conversations but also has the capability to access and provide real-time information by integrating with the Tavily search engine. Whether you're asking for the weather or engaging in everyday dialogue, this assistant delivers accurate and context-aware responses, all in Urdu.")

# Add an audio recorder widget for voice input
audio = audiorecorder()

if len(audio) > 0:
    # Save the recorded audio to a file for the ASR pipeline
    audio.export("audio.wav", format="wav")

    # Convert the audio to text using the wav2vec2 model
    result = pipe("audio.wav")
    user_query = result["text"]

    with st.chat_message("human"):  # create the message box for human input
        st.audio(audio.export().read())  # display the audio player
        st.markdown(user_query)

    # Get response from the LLM
    response_text = infer(user_input=user_query)
    response_audio = text_to_speech(response_text, lang='ur')

    # Play the generated speech in the app
    with st.chat_message("ai"):
        st.audio(response_audio.read(), format='audio/mp3')  # gTTS output is MP3
        st.markdown(response_text)
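# To run locally (assuming this file is saved as app.py): `streamlit run app.py`.
# Note: the audiorecorder component relies on pydub, which needs ffmpeg
# available on the system to decode the browser's recorded audio.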