ZillionParts-PDF2Doc / Allam_Backend_HF.py
E-slam's picture
Upload Allam_Backend_HF.py
ee3f9f8 verified
raw
history blame
7.24 kB
import pandas as pd
import faiss
import numpy as np
import torch
import requests
import os
#import huggingface_hub
# Hugging Face access token, read from the "hf_token" environment variable
# (None when the variable is unset); embed_single_text sends it as a Bearer
# token.
hf_token = os.getenv("hf_token")
#huggingface_hub.login(hf_token)
# Article table. Its 'Article_text' column holds the text that is searched and
# chunked by the functions below.
df = pd.read_excel("Allam_SA_Articles.xlsx")
input_texts = df['Article_text'].tolist()
# Precomputed embedding matrix that is loaded into the Faiss index.
# NOTE(review): presumably one row per row of df, in the same order, because
# query_search maps Faiss hits back to articles with df.iloc -- confirm.
MOJ_embeddings = np.load('Allam_embeddings.npy')
def embed_single_text(query):
    """Fetch the embedding of ``query`` from the remote E5 embeddings service.

    Args:
        query: Text to embed.

    Returns:
        A ``torch.Tensor`` built from the service's JSON payload, or ``None``
        when the service answers with a non-200 status (the status code is
        printed in that case).

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the service does not answer within the timeout.
    """
    headers = {
        "Authorization": f"Bearer {hf_token}"
    }
    url = "https://allam-llm-e5-embeddings.hf.space/e5_embeddings"
    # Hand the query to requests via ``params`` so it is URL-encoded. Pasting
    # it into the URL lets characters such as ``&``, ``#`` or ``+`` truncate
    # or corrupt the query string.
    response = requests.get(
        url,
        headers=headers,
        params={"query": query},
        timeout=60,  # never block the caller forever on a stalled service
    )
    if response.status_code == 200:
        return torch.tensor(response.json())
    print(f"Error: {response.status_code}")
    return None
# Faiss
# Build a flat inner-product index sized to the embedding width and load every
# precomputed article embedding into it.
# NOTE(review): inner-product scores only behave like cosine similarity if the
# stored vectors and the query vectors are L2-normalised -- confirm.
dimension = MOJ_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(MOJ_embeddings)
def query_search(query, K):
    """Return the row positions of the ``K`` articles closest to ``query``.

    Args:
        query: Text to search for.
        K: Number of nearest neighbours requested from the Faiss index.

    Returns:
        A list of integer positions as reported by the Faiss index, best match
        first. It may hold fewer than ``K`` entries when the index cannot
        supply that many neighbours.

    Raises:
        RuntimeError: If the embedding service returned no embedding.
    """
    query_embedding = embed_single_text(query)
    if query_embedding is None:
        # embed_single_text signals a failed request with None; fail with a
        # clear message instead of an opaque error inside Faiss.
        raise RuntimeError("Embedding service returned no embedding for the query")

    # embed_single_text hands back a torch tensor, while Faiss expects a
    # C-contiguous float32 array shaped (n_queries, dimension).
    if hasattr(query_embedding, "detach"):
        query_embedding = query_embedding.detach().cpu().numpy()
    query_matrix = np.ascontiguousarray(
        np.atleast_2d(query_embedding), dtype=np.float32
    )

    _, indices = index.search(query_matrix, K)
    # Faiss pads the result with -1 when it finds fewer than K neighbours;
    # dropping those keeps a later df.iloc[-1] from silently selecting the
    # last row.
    return [int(idx) for idx in indices[0] if idx >= 0]
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def return_top5_chunks(query):
    """Build a text block with the five article chunks most similar to ``query``.

    The 15 nearest articles are fetched with ``query_search``, each article
    text is cut into chunks of at most 150 words, and the chunks are ranked
    against the query by TF-IDF cosine similarity.

    Args:
        query: The user question.

    Returns:
        A string with one ``Index: <row label>,\\nChunk: <text>\\n`` entry per
        selected chunk, entries separated by a ``##########`` line. An empty
        string is returned when no chunk could be produced.
    """
    def chunk_text(text, max_words=150):
        # Split on whitespace and regroup into runs of at most max_words words.
        words = text.split()
        return [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]

    def find_top_k_similar(texts, query, k):
        # Fit TF-IDF on the chunks plus the query (last row), then rank the
        # chunks by cosine similarity to that last row, best first.
        documents = [text for text, _ in texts]
        tfidf_matrix = TfidfVectorizer().fit_transform(documents + [query])
        similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()
        top_k_indices = similarities.argsort()[-k:][::-1]
        return [(texts[i], similarities[i]) for i in top_k_indices]

    matching_indices = query_search(query, 15)
    relevant_rows = df.iloc[matching_indices]

    # Iterate the column directly rather than writing a 'Chunks' column onto a
    # slice of df, which triggers pandas' SettingWithCopyWarning.
    chunked_texts = [
        (chunk, idx)
        for idx, article in relevant_rows['Article_text'].items()
        for chunk in chunk_text(article)
    ]
    if not chunked_texts:
        # Nothing to rank; comparing the query against zero documents would
        # raise inside scikit-learn.
        return ''

    top_5_chunks = find_top_k_similar(chunked_texts, query, 5)
    return "##########\n".join(
        f"Index: {idx},\nChunk: {chunk}\n" for (chunk, idx), _ in top_5_chunks
    )
import requests
# Exchange the IBM Cloud API key for an IAM bearer token used by the
# text-generation calls below.
#
# The API key is a credential and must not be committed to source control, so
# it is read from the IBM_API_KEY environment variable. Any key that was
# previously stored in this file should be treated as leaked and rotated.
api_key = os.getenv("IBM_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the IBM_API_KEY environment variable to the IBM Cloud API key"
    )
url = "https://iam.cloud.ibm.com/identity/token"
headers = {
    "Content-Type": "application/x-www-form-urlencoded"
}
data = {
    "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
    "apikey": api_key
}
response = requests.post(url, headers=headers, data=data)
# Fail with the service's own message rather than a KeyError on
# 'access_token' when the key is rejected.
if response.status_code != 200:
    raise Exception("Non-200 response: " + str(response.text))
token_info = response.json()
# NOTE(review): the token is fetched once at import time. IAM access tokens
# expire, so a long-running process will need to refresh it -- confirm the
# lifetime against the IBM Cloud IAM documentation.
access_token = token_info['access_token']
def allam_response(context, query):
    """Ask the Allam instruct model to answer ``query`` from ``context``.

    The context and question are embedded in a fixed prompt and posted to the
    IBM text-generation endpoint with greedy decoding and up to 900 new tokens.

    Args:
        context: Text the model is told to answer from.
        query: The user question.

    Returns:
        The ``generated_text`` of the first result in the service response.

    Raises:
        Exception: If the service answers with a non-200 status; the message
            carries the response body.
    """
    prompt = f"""
[Context]: {context}
[System]:
You are an Arabic frindley chatbot named مستنير.
You will be provided with an Arabic context ,
Your task is to extract and Answer for the questions only from the context provided
elaborate on the answer from the context
At the end of your response mention the Article : مادة
if no answer is found apologize
Question: {query}
"""
    generation_parameters = {
        "decoding_method": "greedy",
        "max_new_tokens": 900,
        "min_new_tokens": 0,
        "stop_sequences": [],
        "repetition_penalty": 1
    }
    payload = {
        "input": prompt,
        "parameters": generation_parameters,
        "model_id": "sdaia/allam-1-13b-instruct",
        "project_id": "72a4dcd4-e6e9-4cdc-9c7e-1a0ef1483936"
    }
    request_headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {access_token}"
    }
    reply = requests.post(
        "https://eu-de.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29",
        headers=request_headers,
        json=payload,
    )
    if reply.status_code == 200:
        return reply.json()['results'][0]['generated_text']
    raise Exception("Non-200 response: " + str(reply.text))
import json
import re
def index_num(text):
    """Extract the numeric value of the ``"Index"`` field from a model reply.

    Args:
        text: Raw model output, expected to contain ``{"Index": "<digits>"}``.

    Returns:
        The index as an ``int``, or ``None`` when the reply carries no numeric
        index (for example ``{"Index": "not_found"}``, which the prompt in
        ``return_index_num`` explicitly permits).
    """
    # The quotes around the number are optional so that a bare JSON number
    # such as {"Index": 12} is accepted as well.
    match = re.search(r'"Index"\s*:\s*"?(\d+)"?', text)
    # Only convert when something was captured; int(None) would raise.
    return int(match.group(1)) if match else None
def get_top_matching_chunk(text, query, max_words=500):
    """Return the chunk of ``text`` that best matches ``query``.

    ``text`` is split on whitespace into chunks of at most ``max_words`` words
    and the chunks are ranked against the query by TF-IDF cosine similarity.

    Args:
        text: The article text to search in.
        query: The user question.
        max_words: Maximum number of words per chunk.

    Returns:
        The best-matching chunk, with its words joined by single spaces. An
        empty string is returned when ``text`` contains no words.
    """
    words = text.split()
    chunks = [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]

    if not chunks:
        # No words at all: comparing the query against zero documents would
        # raise inside scikit-learn.
        return ''
    if len(chunks) == 1:
        # A single candidate is the answer by definition; skip vectorizing.
        return chunks[0]

    # The query goes in as the last row so it shares the chunks' vocabulary.
    tfidf_matrix = TfidfVectorizer().fit_transform(chunks + [query])
    similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()
    return chunks[similarities.argmax()]
def reformat_indentation(text, indent_spaces=4):
    """Strip every line of ``text`` and re-indent it uniformly.

    Args:
        text: Multi-line text to normalise.
        indent_spaces: Number of spaces placed in front of each line.

    Returns:
        The lines of ``text``, each stripped of surrounding whitespace and
        prefixed with ``indent_spaces`` spaces, joined with newlines.
    """
    prefix = ' ' * indent_spaces
    return '\n'.join(f"{prefix}{raw_line.strip()}" for raw_line in text.splitlines())
def return_index_num(data_text, query):
    """Ask the Allam instruct model which chunk answers ``query``.

    The chunk listing and the question are placed in a fixed prompt that tells
    the model to reply with a small JSON object naming the chunk index. The
    prompt is passed through ``reformat_indentation`` with zero indent before
    it is posted to the IBM text-generation endpoint (greedy decoding, at most
    20 new tokens).

    Args:
        data_text: Chunk listing separated by ``##########`` lines.
        query: The user question.

    Returns:
        The raw ``generated_text`` of the first result in the service response.

    Raises:
        Exception: If the service answers with a non-200 status; the message
            carries the response body.
    """
    instructions = """
Identify the **first** Index chunk with the answer to a given question.
Chunks are seperated by ##########
Respond only with **Json** format **do not return any words**:
{"Index": "extracted_Index"}
Or:
{"Index": "not_found"}
**No additional text allowed**.
""" + f"Question : {query}"
    raw_prompt = f"""
[Context]: {data_text.strip()}
[System]: {instructions.strip()}
"""
    prompt = reformat_indentation(raw_prompt, indent_spaces=0)

    payload = {
        "input": prompt,
        "parameters": {
            "decoding_method": "greedy",
            "max_new_tokens": 20,
            "repetition_penalty": 1
        },
        "model_id": "sdaia/allam-1-13b-instruct",
        "project_id": "72a4dcd4-e6e9-4cdc-9c7e-1a0ef1483936"
    }
    request_headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        # access_token is the module-level IAM bearer token
        "Authorization": f"Bearer {access_token}"
    }
    reply = requests.post(
        "https://eu-de.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29",
        headers=request_headers,
        json=payload,
    )
    if reply.status_code == 200:
        return reply.json()['results'][0]['generated_text']
    raise Exception("Non-200 response: " + str(reply.text))
def allam_llm(q):
    """Answer question ``q`` with the retrieval-augmented Allam pipeline.

    Steps: retrieve the most similar chunks, ask the model which chunk's
    article holds the answer, narrow that article down to its best-matching
    chunk, and generate the final answer from it.

    When the model does not name a usable article (it is allowed to reply
    ``not_found``) or names one that is not a row label of ``df``, the
    retrieved chunks themselves are used as context, so the answer step still
    runs instead of the pipeline crashing.

    Args:
        q: The user question.

    Returns:
        The text generated by ``allam_response``.
    """
    chunks_text = return_top5_chunks(q)
    targeted_chunk = return_index_num(chunks_text, q)

    # The index extraction can come back empty-handed, either as None or as a
    # TypeError from converting a missing match; treat both as "no index".
    try:
        index_number = index_num(targeted_chunk)
    except TypeError:
        index_number = None

    if index_number is None or index_number not in df.index:
        # No trustworthy article to zoom into: answer from the retrieved
        # chunks rather than failing on a bad lookup.
        return allam_response(chunks_text, q)

    text_to_chunk = df['Article_text'][index_number]
    top_chunk = get_top_matching_chunk(text_to_chunk, q)
    return allam_response(top_chunk, q)