# NOTE: Hugging Face Spaces page header (extraction residue) — Space status "Running".
# --- Third-party imports ------------------------------------------------
import pandas as pd
import faiss
import numpy as np
import torch
import requests
import os

#import huggingface_hub
# Hugging Face token read from the environment; used as the bearer token
# for the remote embedding Space called in embed_single_text below.
hf_token = os.getenv("hf_token")
#huggingface_hub.login(hf_token)

# Corpus of Saudi legal articles plus their precomputed embeddings.
# NOTE(review): assumes row i of the .npy aligns with row i of the Excel
# sheet — confirm against the embedding-generation script.
df = pd.read_excel("Allam_SA_Articles.xlsx")
input_texts = df['Article_text'].tolist()
MOJ_embeddings = np.load('Allam_embeddings.npy')
def embed_single_text(query):
    """Embed one query string via the remote E5 embedding Space.

    Parameters
    ----------
    query : str
        Free-text (Arabic) query to embed.

    Returns
    -------
    torch.Tensor or None
        Tensor built from the service's JSON payload, or None on a
        non-200 response (the status code is printed).
    """
    headers = {
        "Authorization": f"Bearer {hf_token}"
    }
    url = "https://allam-llm-e5-embeddings.hf.space/e5_embeddings"
    # Pass the query via params= so requests percent-encodes it; the
    # original f-string interpolation produced an invalid URL for Arabic
    # text, spaces, '&', '?', etc.
    response = requests.get(url, headers=headers,
                            params={"query": query}, timeout=60)
    if response.status_code == 200:
        return torch.tensor(response.json())
    print(f"Error: {response.status_code}")
    return None
#Faiss
# Exact (brute-force) inner-product index over the article embeddings.
# NOTE(review): IndexFlatIP ranks by raw inner product, which equals
# cosine similarity only if the stored embeddings (and the query vector)
# are L2-normalized — confirm how Allam_embeddings.npy was produced.
dimension = MOJ_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(MOJ_embeddings)
def query_search(query, K):
    """Return the ids of the top-K most similar articles for *query*.

    Parameters
    ----------
    query : str
        The user's question.
    K : int
        Number of nearest neighbours to retrieve.

    Returns
    -------
    list[int]
        FAISS ids (row positions into ``df``), best match first.

    Raises
    ------
    RuntimeError
        If the embedding service returned no vector (instead of the
        original crash inside ``index.search`` on ``None``).
    """
    query_embedding = embed_single_text(query)
    if query_embedding is None:
        raise RuntimeError("embedding service returned no vector for query")
    # FAISS expects a 2-D float32 numpy array of shape (n_queries, dim);
    # convert from the torch tensor and force a single-row matrix.
    vec = np.asarray(query_embedding, dtype=np.float32).reshape(1, -1)
    distances, indices = index.search(vec, K)
    # indices[0] holds the ids for our single query. The original also
    # looked up File_ID / Row_Number per hit and discarded them (dead
    # code left from a commented-out results format) — removed.
    return [int(idx) for idx in indices[0]]
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.metrics.pairwise import cosine_similarity | |
def return_top5_chunks(query):
    """Return the 5 chunks most relevant to *query*, as one formatted string.

    Pipeline: retrieve the top-15 articles from the FAISS index, split
    each into ~150-word chunks, re-rank all chunks by TF-IDF cosine
    similarity to the query, and concatenate the best 5 as
    ``"Index: <df index>,\\nChunk: <text>"`` blocks separated by a
    ``##########`` line (the format return_index_num's prompt expects).
    """
    matching_indices = query_search(query, 15)
    # .copy() so the 'Chunks' column is added to an independent frame —
    # assigning into df.iloc[...] directly is chained assignment on a
    # view (SettingWithCopyWarning, possible accidental mutation of df).
    relevant_rows = df.iloc[matching_indices].copy()

    def chunk_text(text, max_words=150):
        # Consecutive windows of max_words whitespace-separated words.
        words = text.split()
        return [' '.join(words[i:i + max_words])
                for i in range(0, len(words), max_words)]

    relevant_rows['Chunks'] = relevant_rows['Article_text'].apply(chunk_text)
    # Flatten to (chunk_text, df_index) pairs so each chunk remembers its
    # source article.
    chunked_texts = []
    for idx, row in relevant_rows.iterrows():
        for chunk in row['Chunks']:
            chunked_texts.append((chunk, idx))

    def find_top_k_similar(texts, query, k):
        # Rank candidate chunks by TF-IDF cosine similarity to the query;
        # the query is vectorized jointly so it shares the vocabulary.
        documents = [text for text, _ in texts]
        vectorizer = TfidfVectorizer()
        all_texts = documents + [query]
        tfidf_matrix = vectorizer.fit_transform(all_texts)
        similarities = cosine_similarity(tfidf_matrix[-1],
                                         tfidf_matrix[:-1]).flatten()
        top_k_indices = similarities.argsort()[-k:][::-1]
        return [(texts[i], similarities[i]) for i in top_k_indices]

    top_5_chunks = find_top_k_similar(chunked_texts, query, 5)
    chunks_txt = ''
    for i, ((chunk, idx), similarity) in enumerate(top_5_chunks):
        chunks_txt += f"Index: {idx},\nChunk: {chunk}\n"
        if i < len(top_5_chunks) - 1:
            chunks_txt += "##########\n"
    return chunks_txt
import requests

# SECURITY(review): this IBM Cloud API key is hard-coded and committed in
# plain text — it should be rotated and loaded from an environment
# variable / secret store instead.
api_key = 'UEGtyhQpPCKfhsQ_rPlBbEsgZErSh8xPU57qm9DQ-ZkC'

# Exchange the API key for an IAM bearer token used by the watsonx.ai
# text-generation calls below.
# NOTE(review): the token is fetched once at import time; IAM tokens
# expire (typically ~1h), so a long-running process will start getting
# 401s — confirm and add refresh if needed.
url = "https://iam.cloud.ibm.com/identity/token"
headers = {
    "Content-Type": "application/x-www-form-urlencoded"
}
data = {
    "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
    "apikey": api_key
}
response = requests.post(url, headers=headers, data=data, timeout=60)
# Fail fast with a clear HTTP error rather than an opaque KeyError on
# 'access_token' when the IAM request is rejected.
response.raise_for_status()
token_info = response.json()
access_token = token_info['access_token']
def allam_response(context, query):
    """Generate the final Arabic answer for *query* grounded in *context*.

    Parameters
    ----------
    context : str
        The article chunk the answer must be extracted from.
    query : str
        The user's question.

    Returns
    -------
    str
        The model's generated text.

    Raises
    ------
    Exception
        On any non-200 response from the generation endpoint.
    """
    url = "https://eu-de.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29"
    # Prompt body; fixed the "frindley" typo -> "friendly".
    input_text_base = f"""
[Context]: {context}
[System]:
You are an Arabic friendly chatbot named مستنير.
You will be provided with an Arabic context ,
Your task is to extract and Answer for the questions only from the context provided
elaborate on the answer from the context
At the end of your response mention the Article : مادة
if no answer is found apologize
Question: {query}
"""
    body = {
        "input": input_text_base,
        "parameters": {
            "decoding_method": "greedy",
            "max_new_tokens": 900,
            "min_new_tokens": 0,
            "stop_sequences": [],
            "repetition_penalty": 1
        },
        "model_id": "sdaia/allam-1-13b-instruct",
        "project_id": "72a4dcd4-e6e9-4cdc-9c7e-1a0ef1483936"
    }
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {access_token}"  # module-level IAM token
    }
    # Timeout so a hung endpoint cannot block the app indefinitely.
    response = requests.post(url, headers=headers, json=body, timeout=120)
    if response.status_code != 200:
        raise Exception("Non-200 response: " + str(response.text))
    payload = response.json()
    return payload['results'][0]['generated_text']
import json | |
import re | |
def index_num(text):
    """Extract the numeric "Index" value from the model's JSON-ish reply.

    Parameters
    ----------
    text : str
        Model output expected to contain ``{"Index": "<digits>"}``.

    Returns
    -------
    int or None
        The parsed index, or None when no numeric index is present —
        e.g. when the model answered ``{"Index": "not_found"}``.
        (Previously this path crashed with ``int(None)`` -> TypeError.)
    """
    match = re.search(r'"Index":\s*"(\d+)"', text)
    return int(match.group(1)) if match else None
def get_top_matching_chunk(text, query, max_words=500):
    """Return the *max_words*-word chunk of *text* most similar to *query*.

    The text is split on whitespace into consecutive windows of at most
    *max_words* words; chunks and query are vectorized together with
    TF-IDF and the chunk with the highest cosine similarity wins.
    """
    words = text.split()
    chunks = [' '.join(words[start:start + max_words])
              for start in range(0, len(words), max_words)]
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(chunks + [query])
    # Last row is the query; the rows before it are the chunks.
    scores = cosine_similarity(matrix[-1], matrix[:-1]).flatten()
    return chunks[scores.argmax()]
def reformat_indentation(text, indent_spaces=4):
    """Normalize per-line indentation of *text*.

    Each line is stripped of leading/trailing whitespace and re-prefixed
    with *indent_spaces* spaces; lines are re-joined with ``\\n``.
    """
    prefix = ' ' * indent_spaces
    return '\n'.join(prefix + raw.strip() for raw in text.splitlines())
def return_index_num(data_text, query):
    """Ask the model which chunk (by "Index") contains the answer.

    Parameters
    ----------
    data_text : str
        The '##########'-separated chunk listing from return_top5_chunks.
    query : str
        The user's question.

    Returns
    -------
    str
        Raw generated text, expected to be ``{"Index": "<n>"}`` or
        ``{"Index": "not_found"}`` (parsed later by index_num).

    Raises
    ------
    Exception
        On any non-200 response from the generation endpoint.
    """
    url = "https://eu-de.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29"
    sys_prompt = """
Identify the **first** Index chunk with the answer to a given question.
Chunks are seperated by ##########
Respond only with **Json** format **do not return any words**:
{"Index": "extracted_Index"}
Or:
{"Index": "not_found"}
**No additional text allowed**.
"""
    sys_prompt += f"Question : {query}"
    input_text = f"""
[Context]: {data_text.strip()}
[System]: {sys_prompt.strip()}
"""
    # Normalize per-line leading whitespace before sending to the model.
    input_text = reformat_indentation(input_text, indent_spaces=0)
    body = {
        "input": input_text,
        "parameters": {
            "decoding_method": "greedy",
            # The reply is a tiny JSON object; 20 tokens is plenty.
            "max_new_tokens": 20,
            "repetition_penalty": 1
        },
        "model_id": "sdaia/allam-1-13b-instruct",
        "project_id": "72a4dcd4-e6e9-4cdc-9c7e-1a0ef1483936"
    }
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {access_token}"  # access_token must be defined elsewhere
    }
    # Timeout so a hung endpoint cannot block the app indefinitely.
    response = requests.post(url, headers=headers, json=body, timeout=120)
    if response.status_code != 200:
        raise Exception("Non-200 response: " + str(response.text))
    payload = response.json()
    return payload['results'][0]['generated_text']
def allam_llm(q):
    """End-to-end RAG pipeline entry point.

    Retrieves candidate chunks for *q*, asks the model which article
    index answers it, re-chunks that article, and generates the final
    grounded answer.

    Parameters
    ----------
    q : str
        The user's question (Arabic).

    Returns
    -------
    str
        The model's generated answer.
    """
    chunks_text = return_top5_chunks(q)
    targeted_chunk = return_index_num(chunks_text, q)
    index_number = index_num(targeted_chunk)
    if index_number is None:
        # Model replied "not_found" (or unparseable): fall back to the
        # chat model with no context so it produces its standard apology.
        return allam_response("", q)
    # Explicit label-based lookup: the index printed into chunks_text is
    # the dataframe label from iterrows(), and bare Series[int] indexing
    # is ambiguous/deprecated in modern pandas.
    text_to_chunk = df['Article_text'].loc[index_number]
    top_chunk = get_top_matching_chunk(text_to_chunk, q)
    allam_res = allam_response(top_chunk, q)
    return allam_res