# NOTE: the "Spaces: / Paused" lines originally here were Hugging Face Spaces
# page artifacts captured along with the source, not program code.
from llama_index.core.response.notebook_utils import display_source_node | |
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding | |
from llama_index.core.query_engine import RetrieverQueryEngine | |
from llama_index.core import VectorStoreIndex, ServiceContext | |
from llama_index.core.node_parser import SimpleNodeParser | |
from llama_index.llms.azure_openai import AzureOpenAI | |
from llama_index.readers.file import PDFReader | |
from llama_index.core.schema import IndexNode | |
from llama_index.core import Document | |
from langchain_core.messages import HumanMessage | |
from langchain_openai import AzureChatOpenAI | |
from langchain.text_splitter import CharacterTextSplitter | |
from langchain.chains import ConversationChain | |
from langchain.memory import ConversationBufferWindowMemory | |
from langchain.prompts import PromptTemplate | |
from sentence_transformers import util | |
from datasets import load_dataset | |
from openai import AzureOpenAI | |
from bs4 import BeautifulSoup | |
import pyshorteners | |
import gradio as gr | |
import pandas as pd | |
import numpy as np | |
import warnings | |
import pickle | |
import string | |
import json | |
import time | |
import ast | |
import os | |
import re | |
# Azure OpenAI client used for chat completions and embeddings below.
# SECURITY: an API key was hard-coded here. It is kept only as a fallback so
# existing deployments keep working; prefer the AZURE_OPENAI_API_KEY /
# AZURE_OPENAI_ENDPOINT environment variables, rotate this key, and remove
# the literal as soon as possible.
client = AzureOpenAI(
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT", "https://moj-ada3.openai.azure.com/"),
    api_key=os.environ.get("AZURE_OPENAI_API_KEY", "9639718f1a7d478a9313d2b2aeb5dacc"),
    api_version="2024-02-15-preview",
)
# Silence warnings before loading data so pandas parser warnings are muted too.
warnings.filterwarnings("ignore")
# Master dataset: one row per law (raw HTML, subject headers, links, topic...).
df = pd.read_csv("data/Data.csv")
def extract_title(text):
    """Return the part after the last hyphen or en-dash, or '' if neither occurs."""
    for dash in ('-', '\u2013'):  # plain hyphen checked before en-dash
        if dash in text:
            return text.split(dash)[-1].strip()
    return ""
def remove_title(text):
    """Return the part before the first hyphen or en-dash, or the whole text if neither occurs."""
    for dash in ('-', '\u2013'):  # plain hyphen checked before en-dash
        if dash in text:
            return text.split(dash)[0].strip()
    return text
def get_articles(i):
    """Parse row *i* of the global ``df`` into a (Header, Text, Comment) DataFrame.

    Divs whose CSS class matches ``h_class`` are treated as article-header
    separators; everything between them accumulates as the article body.
    Footnote divs (ids starting with 'fn') are matched back to articles as
    amendment comments. Returns None implicitly if anything raises, because
    of the blanket except at the bottom — callers must handle that.
    """
    try:
        result_df = pd.DataFrame(columns=['Header', 'Text','Comment'])
        #html = df[df['Id'] == 35850]['HTML'][621]
        html = df['HTML'][i]
        soup = BeautifulSoup(html, 'html.parser')
        divs = soup.find_all('div')
        # CSS class marking an article-header div in the source HTML.
        h_class = 'x__1575___1604___1605___1575___1583___1577_14'
        x = 0      # number of header divs seen so far
        txt = ''   # body text accumulated for the current section
        # Subjects column holds a Python-literal list of header strings.
        headers = ast.literal_eval(df['Subjects'][i])
        for d in divs:
            try:
                if d.get('class') is None:
                    # Unclassed wrapper: use the first nested div instead.
                    d_class = d.find('div').get('class')[0]
                    d_text = d.find('div').text.replace('\n\n',' ').replace('\n',' ')
                else:
                    d_class = d.get('class')[0]
                    d_text = d.text.replace('\n\n',' ').replace('\n',' ')
                if h_class not in d_class:
                    # Body div: keep accumulating for the current section.
                    txt += " " +d_text
                else:
                    if x == 0:
                        # Text before the first header is the law description.
                        result_df = pd.concat([result_df, pd.DataFrame({'Header': ['Desc'], 'Text': [txt]})], ignore_index=True)
                        txt = ''
                        x += 1
                    else:
                        result_df = pd.concat([result_df, pd.DataFrame({'Header': [headers[x-1]], 'Text': [txt]})], ignore_index=True)
                        txt = ''
                        x += 1
            except:
                # NOTE(review): silently skips malformed divs — best-effort
                # parsing, but this also hides real errors.
                pass
        # Flush the text accumulated after the last header div.
        result_df = pd.concat([result_df, pd.DataFrame({'Header': [headers[x-1]], 'Text': [txt]})], ignore_index=True)
        # Footnote divs (ids starting with 'fn') hold amendment notes.
        # NOTE(review): the lambda parameter `x` shadows the counter above
        # (harmless here, but confusing).
        divs_with_showfn = soup.find_all('div', id=lambda x: x and x.startswith('fn'))
        for r in range (result_df.shape[0]):
            article = result_df['Header'][r].split('-')[0].strip()
            for n,d in enumerate(divs_with_showfn):
                edit = d.text.replace('\n\n',' ').replace('\n',' ')
                match = edit[:35]
                # Normalize hamza spelling before matching; the word "القديم"
                # ("the old") marks an old-version footnote for this article.
                if (article.replace("الأولى","الاولى") in match.replace("الأولى","الاولى")) and ("القديم" in match) :
                    #result_df['Text'][r] += "\n\n-تعديل-\n\n" + edit
                    result_df['Comment'][r] = edit
        if divs_with_showfn:
            # Footnotes are appended after the body in the HTML, so trim the
            # first footnote's text off the tail of the last article.
            firstindex = divs_with_showfn[0].text.replace('\n\n',' ').replace('\n',' ')
            last_e = result_df.shape[0] -1
            mada = result_df['Text'][last_e]
            if firstindex in mada :
                result_df['Text'][last_e] = (mada.split(firstindex)[0])
        #result_df['Title'] = result_df['Header'].apply(extract_title)
        #result_df['Header'] = result_df['Header'].apply(remove_title)
        return result_df.reset_index(drop=True)
    except:
        # NOTE(review): swallows every error and returns None.
        pass
# Vector index built offline over the law articles.
# SECURITY NOTE: pickle.load executes arbitrary code — only load this file
# from a trusted source.
with open('data/ada_base_index_small.pkl', 'rb') as f:
    base_index_ = pickle.load(f)
# Azure OpenAI connection settings.
# SECURITY: hard-coded API key — rotate it and move it to configuration.
azure_endpoint = "https://moj-ada3.openai.azure.com/"
api_key = "9639718f1a7d478a9313d2b2aeb5dacc"
api_version = "2024-02-15-preview"
deployment = "gpt-35-turbo-16k"
# LangChain's AzureChatOpenAI reads the endpoint/key from these env vars.
os.environ["AZURE_OPENAI_API_KEY"] = api_key
os.environ["AZURE_OPENAI_ENDPOINT"] = azure_endpoint
llm_chain = AzureChatOpenAI(
    openai_api_version=api_version,
    azure_deployment=deployment,
)
# NOTE: a second AzureOpenAI(...) client identical to the module-level
# `client` created near the top of the file used to be constructed here;
# the duplicate was removed.
# Stage-1 system prompt (intent determination): the model's only job is to
# identify which law/article node the user wants and then emit the Returns
# block carrying the node's five-digit ID. Filled with query_df() output
# via .format(). Spelling errors in the original prompt text were fixed.
SYS_TEMPLATE = """
The following is a friendly conversation between a human and an AI.
AI must follow the Instructions below
Instructions:
- AI is an Arabic legal expert in the UAE.
- AI shall always reply in Arabic.
- AI shall never reply in English.
- AI shall not repeat any questions or rephrase them.
- AI shall ask a precise question if needed to determine the user's intent.
- AI shall only ask a maximum of one question if needed to human and then determine his intent.
- AI shall only reply to questions related to law subjects.
- AI shall not answer or explain or give any advice to user questions.
- AI MUST not provide any details ever from given information, only use it to determine the desired intent.
- AI shall use the given information only to ask precise and short question to determine user intent.
- AI shall determine the user desired intent with the minimum number of questions possible.
- AI shall not ask the user again after the user confirms on any question.
- AI shall decide user intent if the user's query contains enough details without asking him any more questions.
- AI shall decide which suits query better if user wants a general info or says give me anything.
- AI's only purpose is to determine the intended topic from the user.
- AI shall choose node with the best description matching with the human's intent.
- AI shall always end the conversation with the returns below as long as the user question matches with given info.
- if AI asks a question and human says he doesn't know the specific law or article then AI shall determine and end the conversation with the returns below.
- if Human asks a question (Is it permissible (هل يجوز)) AI should find the best node that can answer the question with yes or no.
- AI shall end the conversation when the user confirms his intent and return as mentioned below from node's metadata.
- AI shall mention every detail the user wants in the userintent returns.
- AI MUST include the five digits number in the returns.
- AI shall never leave the ID in returns empty it should always be five digits.
Returns:
[
ID: five digits number ,
Topic: ,
userIntent :
]
Information:
{}
"""
# Stage-2 system prompt (grounded answering): answer strictly from the
# retrieved article content, in Arabic, quoting exact text and citing the
# article number. Filled with query_df_filtered() output via .format().
# A typo ("inculding") and a stray Arabic diacritic before "AI" were fixed.
sys_prompt_intent = """
The following is a friendly conversation between a human and an AI.
AI must follow the Instructions below
Instructions:
- AI is an Arabic legal expert in the UAE.
- AI shall always reply in Arabic.
- AI shall never reply in English.
- AI shall answer the human questions based on the content provided.
- AI shall answer only from within the Content provided , and NOT from outside.
- AI shall answer using the exact text in content and not improvise.
- AI shall NOT improvise , or give any advices nor explanation.
- AI shall not provide any links to user and tell him to search in it, it should always provide the required info.
- AI shall always answer to the user query in a professional and informative way including all the details.
- AI shall answer every question asked in the conversation from human in a detailed way.
- AI shall include in the answer the article number (رقم المادة)
Content:
{}
"""
# ASCII punctuation set; used later to scrub the extracted user-intent text.
punctuations = string.punctuation
def generate_embeddings(text, model="ada3_small"):
    """Embed *text* via the Azure OpenAI embedding deployment (default "ada3_small")."""
    return client.embeddings.create(input = [text], model=model).data[0].embedding
# Retriever over the pickled vector index: top 10 nearest nodes per query.
base_retriever = base_index_.as_retriever(similarity_top_k=10)
def query_df(query):
    """Retrieve the top-10 index nodes for *query* and format them as context.

    For each retrieved node, the full article text is re-extracted from the
    global ``df`` via the node's ID/Article metadata; texts longer than 800
    characters are reduced to the most query-similar 800-char chunk. Returns
    one string of "Node Number i : <text> -- Node MetaData : <meta>" lines,
    used to fill SYS_TEMPLATE.
    """
    retrievals = base_retriever.retrieve(query)
    info = ''
    for i, r in enumerate(retrievals):
        # Map the node back to its source row and re-parse the article body.
        article_index = df[df['Id'] == int(r.metadata['ID'])].index[0]
        article_df = get_articles(article_index)
        article_intended = article_df[article_df['Header'] == r.metadata['Article']].reset_index()
        article_text = article_intended['Text'][0]
        if len(article_text) > 800:
            # Keep only the chunk most similar to the query.
            related_txt = related_text(article_text, query, 800)[0]
        else:
            related_txt = article_text
        # Expose only Description and ID to the intent model.
        meta = {
            'Description': r.metadata['Description'],
            'ID': r.metadata['ID'],
        }
        info += f"Node Number {i+1} : {related_txt} -- Node MetaData : {meta}\n"
    return info
from llama_index.core.vector_stores.types import ExactMatchFilter, MetadataFilters | |
def query_df_filtered(query, id):
    """Retrieve up to 3 nodes for *query* restricted to the law with ID *id*.

    Same article-lookup pipeline as query_df, but with an exact-match
    metadata filter on ID and a larger 5000-char text budget. Returns a
    string of "Article {meta} : <text>" lines used to fill sys_prompt_intent.
    """
    # NOTE(review): parameter name `id` shadows the builtin; kept for
    # interface compatibility with existing callers.
    filters = MetadataFilters(filters=[
        ExactMatchFilter(
            key="ID",
            value=str(id)
        )
    ])
    b_retriever = base_index_.as_retriever(similarity_top_k=3, filters=filters)
    retrievals = b_retriever.retrieve(query)
    info_filtered = ''
    for i, r in enumerate(retrievals):
        article_index = df[df['Id'] == int(r.metadata['ID'])].index[0]
        article_df = get_articles(article_index)
        article_intended = article_df[article_df['Header'] == r.metadata['Article']].reset_index()
        article_text = article_intended['Text'][0]
        if len(article_text) > 5000:
            related_txt = related_text(article_text, query, 5000)[0]
        else:
            related_txt = article_text
        # Only the article header is exposed as metadata in stage 2.
        meta = {
            'Header': r.metadata['Article']
        }
        info_filtered += f"Article {meta} : {related_txt} \n"
    return info_filtered
def related_text(txt, q, size):
    """Return a one-element list with the *size*-char chunk of *txt* most similar to *q*.

    Splits *txt* on spaces into overlapping chunks (50-char overlap), embeds
    every chunk and the query, and picks the chunk with the highest cosine
    similarity. Callers index the result with [0].
    """
    text_splitter = CharacterTextSplitter(
        separator=" ",
        chunk_size=size,
        chunk_overlap=50,
        length_function=len,
    )
    chunks = text_splitter.split_text(txt)
    embeddings = [generate_embeddings(chunk) for chunk in chunks]
    query_embedding = generate_embeddings(q)
    similarity_scores = util.cos_sim(query_embedding, embeddings)
    # argsort of the negated scores puts the best-matching chunk first.
    best = int(np.argsort(-similarity_scores)[0][0])
    return [chunks[best]]
def format_messages(message_list):
    """Render LangChain messages as "Human : ..." / "AI : ..." transcript lines.

    Consecutive messages from the same speaker are merged onto one line,
    separated by a single space. Messages that are neither a HumanMessage
    nor an AIMessage are ignored. The speaker is detected from the class
    name string so both langchain and langchain_core message classes work.
    """
    formatted_messages = []
    current_speaker = None
    for message in message_list:
        type_name = str(type(message))
        if 'HumanMessage' in type_name:
            speaker = 'Human'
        elif 'AIMessage' in type_name:
            speaker = 'AI'
        else:
            continue  # unknown message type: skip (matches original behavior)
        if speaker != current_speaker:
            current_speaker = speaker
            formatted_messages.append(f'{speaker} : {message.content}')
        else:
            # Same speaker as the previous line: merge onto it.
            formatted_messages[-1] += f' {message.content}'
    return '\n'.join(formatted_messages)
def memory_prompt():
    """Build the "Current conversation:" prompt from the LangChain memory buffer."""
    global history
    if len (memory.chat_memory.messages) <= 8 :
        chat_history_lines = format_messages(memory.chat_memory.messages)
    else:
        # NOTE(review): [8:] drops only the FIRST 8 messages, so the included
        # window still grows without bound — [-8:] (keep the last 8) was
        # probably intended. TODO confirm before changing.
        chat_history_lines = format_messages(memory.chat_memory.messages[8:])
    prompt = f"""
Current conversation:
{chat_history_lines}
"""
    return prompt
def update_prompt(human, ai):
    """Persist one (human, ai) exchange into memory and return the refreshed prompt."""
    memory.save_context({"input": human}, {"output": ai})
    prompt = memory_prompt()
    return prompt
# URL shortener used to append a short law link to final answers.
shortener = pyshorteners.Shortener()
# NOTE(review): this result is never used, and the call hits the TinyURL API
# at import time (possibly as a connectivity warm-up) — TODO confirm needed.
short_url = shortener.tinyurl.short(df['Links'][0])
# Chat-completions deployment name.
mod ="gpt-35-turbo-16k"
memory = ConversationBufferWindowMemory()
# Conversation state shared (as module globals) by the Gradio callbacks below.
x=0                      # 0 until the first retrieval has been run
info = ''                # stage-1 retrieval context (query_df output)
history = ''
is_locked = False        # True once a law has been chosen (stage-2 mode)
is_found = False         # True once the model emitted the ID/intent block
new_session = False
is_new = False
captured_ID = ''         # five-digit law ID extracted from the model output
user_intent_text = ''
full_ans = ''
prompt = f"""
Current conversation:
"""
def clean_ans(answer):
    """Strip a single leading "Assistant:", "AI:" or "AI :" speaker tag from *answer*.

    Only the first matching prefix is removed; whitespace that followed the
    tag is preserved (callers rely on that). Text without a tag is returned
    unchanged. Dead commented-out code from the original was removed.
    """
    # Order mirrors the original elif chain: "Assistant:" first, then "AI:",
    # then "AI :" (the variants do not overlap).
    for tag in ("Assistant:", "AI:", "AI :"):
        if answer.startswith(tag):
            return answer[len(tag):]
    return answer
def user(user_message, history):
    """Gradio handler: clear the textbox and append the new turn (answer pending) to the chat."""
    updated_history = history + [[user_message, None]]
    return "", updated_history
def slow_echo(history):
    """Streaming Gradio handler implementing the two-stage legal assistant.

    Stage 1 (is_locked False): stream an intent-determination completion over
    the retrieved nodes (SYS_TEMPLATE). Output is buffered up to 1500 chars
    so a Returns block containing the law ID can be intercepted instead of
    shown to the user. Stage 2 (is_found or is_locked): extract the
    five-digit ID and user intent, retrieve that law's articles, and stream
    a grounded answer (sys_prompt_intent), appending topic + short link the
    first time, then lock follow-ups onto that law. Yields the updated chat
    history after each streamed chunk.
    """
    global prompt
    global is_locked
    global is_found
    global captured_ID
    global user_intent_text
    global x
    global info
    global new_session
    global full_ans
    global is_new
    user_message = history[-1][0]
    my_query = history[-1][0]
    if x == 0:
        # First message of a session: run retrieval once and cache it.
        info = query_df(user_message)
        x+=1
    if is_locked == False:
        # ---- Stage 1: intent determination over the retrieved nodes ----
        SYS_PROMPT = SYS_TEMPLATE.format(info)
        USER_PROMPT = prompt.rstrip() + f"\nHuman : {user_message}"
        message_text=[
            {
                "role": "system",
                "content": SYS_PROMPT
            },
            {
                "role": "user",
                "content": USER_PROMPT
            },
        ]
        stream = client.chat.completions.create(
            model= mod,
            messages = message_text,
            temperature=0.0,
            max_tokens=1700,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None,
            stream=True,
        )
        history[-1][1] = ""
        full_ans =""
        cleaned = False   # True once the first 1500 chars were flushed to the UI
        is_found = False  # True once the answer contains "id" (Returns block)
        for chunk in stream:
            if not chunk.choices:
                pass
            else:
                if chunk.choices[0].delta.content is not None:
                    if is_found == False:
                        if cleaned == False:
                            # Buffer silently: if the model is emitting the
                            # ID/intent block we must not show it to the user.
                            full_ans += chunk.choices[0].delta.content
                            if len(full_ans) >= 1500 :
                                # Long answer: treat as a normal reply, strip
                                # any "AI:"-style tag, then flush char by char.
                                cleaned = True
                                full_ans = clean_ans(full_ans)
                                if 'id' in full_ans.lower():
                                    is_found = True
                                else:
                                    for t in full_ans:
                                        time.sleep(0.03)
                                        history[-1][1] += t
                                        yield history
                        elif cleaned == True:
                            # Buffer already flushed: stream pass-through.
                            time.sleep(0.03)
                            full_ans += chunk.choices[0].delta.content
                            history[-1][1] += chunk.choices[0].delta.content
                            yield history
                    else:
                        # ID detected: keep collecting silently (not shown).
                        full_ans += chunk.choices[0].delta.content
        if is_found == False:
            if len(full_ans) <1500 :
                # Short answer that never hit the 1500-char flush above.
                if 'id' in full_ans.lower():
                    is_found = True
                else:
                    full_ans = clean_ans(full_ans)
                    for t in full_ans:
                        time.sleep(0.02)
                        history[-1][1] += t
                        yield history
    ########################################################################################################
    else :
        # Session already locked onto a law: reuse the captured ID.
        full_ans = captured_ID
    if (is_found) or (is_locked) :
        # ---- Stage 2: grounded answer from the chosen law's articles ----
        if not is_locked:
            # Extract the five-digit law ID and the userIntent text from the
            # model's Returns block.
            pattern = r'\b\d{5}\b'
            matches = re.findall(pattern, full_ans)
            captured_ID = matches[0]
            matched = re.search(r'user(?:intent)?\s*:\s*(.*)', full_ans, re.IGNORECASE)
            user_intent_text = (matched.group(1).strip())
            # Strip punctuation so the intent works as a retrieval query.
            user_intent_text = "".join([x for x in user_intent_text if x not in punctuations])
            my_query = user_intent_text
        else:
            my_query = user_message
        related_txt = query_df_filtered(my_query, captured_ID)
        law_df = df[df['Id'] == int(captured_ID)].reset_index()
        ##################################################################2nd
        SYS_PROMPT = sys_prompt_intent.format(related_txt)
        USER_PROMPT = prompt.rstrip() + f"\nHuman : {my_query}"
        message_text=[
            {
                "role": "system",
                "content": SYS_PROMPT
            },
            {
                "role": "user",
                "content": USER_PROMPT
            },
        ]
        stream = client.chat.completions.create(
            model= mod,
            messages = message_text,
            temperature=0.0,
            max_tokens=1500,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None,
            stream=True,
        )
        history[-1][1] = ""
        full_ans = ''
        for chunk in stream:
            if not chunk.choices:
                pass
            else:
                if chunk.choices[0].delta.content is not None:
                    time.sleep(0.03)
                    history[-1][1] += clean_ans(chunk.choices[0].delta.content)
                    full_ans += clean_ans(chunk.choices[0].delta.content)
                    yield (history)
        ########################################################################################################
        if not is_locked:
            # First grounded answer: append the law topic and a short link.
            link = shortener.tinyurl.short(law_df['Links'][0])
            law_links = f"\n\nTopic : {law_df['Topic'][0]}\nLink : {link}"
            for chunk in law_links:
                time.sleep(0.01)
                history[-1][1] += chunk
                yield history
            # Lock follow-up questions onto this law.
            is_locked = True
        else:
            pass
    # Persist this exchange into memory and refresh the running prompt.
    prompt = update_prompt(my_query, full_ans)
def test_function():
    """Reset all module-level conversation state for a fresh search session ("بحث جديد")."""
    global new_session
    global is_locked
    global is_found
    global user_intent_text
    global captured_ID
    global full_ans
    global history
    global info
    global prompt
    global x
    global memory
    # Fresh LangChain memory buffer.
    memory = ConversationBufferWindowMemory()
    # Clear the boolean mode flags.
    new_session = False
    is_locked = False
    is_found = False
    # Clear all accumulated text state.
    user_intent_text = captured_ID = full_ans = ''
    history = info = ''
    x = 0
    # Base prompt header (same value the module starts with).
    prompt = "\nCurrent conversation:\n"
def reset_echo(history):
    """Trim the chat back to its first entry (the welcome message) and yield it."""
    trimmed = [history[0]]
    yield trimmed
# Chatbot greeting (Arabic): "Hello, I'm Ammar, a specialist in the UAE
# Ministry of Justice legislation encyclopedia. How can I help you?"
welcome_message=" مرحبا معك عمار متخصص في موسوعة القوانين لوزارة العدل بالامارات.كيف يمكنني مساعدتك ؟ "
# Page description (Arabic): "The legal portal of the Ministry of Justice -
# United Arab Emirates - laws and legislation".
desc = "البوابة القانونية لوزارة العدل - الامارات العربية المتحدة- القوانين والتشريعات"
# --- Gradio UI wiring ---
with gr.Blocks(theme=gr.themes.Soft(), title="HI") as demo:
    with gr.Row():
        # Ministry of Justice banner image.
        image_path = "https://i.postimg.cc/kgJGhg32/UAE-MOJ-img.png"
        gr.Image(image_path, height=120, show_download_button=False, show_label= False)
    gr.Markdown(value=desc, rtl=True)
    chatbot = gr.Chatbot(value=[(None,welcome_message)],height=350, rtl=True)
    with gr.Row():
        msg = gr.Textbox(container=False, min_width=750)
        submit_btn = gr.Button(value="Submit", variant="primary")
        # NOTE(review): click() with no handler is a no-op registration —
        # likely leftover; the real handler is attached below.
        submit_btn.click()
    with gr.Row():
        new_search = gr.Button(value="بحث جديد")  # "New search"
        # Reset all module-level conversation state.
        new_search.click(fn=test_function)
    #gr.ClearButton([msg, chatbot])
    # Enter in the textbox: append the user turn, then stream the answer.
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        slow_echo, chatbot, chatbot
    )
    submit_btn.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        slow_echo, chatbot, chatbot
    )
    # NOTE(review): new_search is wired twice — it both resets state (above)
    # and appends the textbox content then trims history here; confirm the
    # double registration is intended.
    new_search.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        reset_echo, chatbot, chatbot
    )
demo.launch(inline=False)