import os

import faiss
import numpy as np
import pandas as pd
import requests
import torch
# import huggingface_hub

hf_token = os.getenv("hf_token")
# huggingface_hub.login(hf_token)

# Load the articles and their precomputed e5 embeddings.
df = pd.read_excel("Allam_SA_Articles.xlsx")
input_texts = df['Article_text'].tolist()
MOJ_embeddings = np.load('Allam_embeddings.npy')


def embed_single_text(query):
    """Embed a single query string via the hosted e5 embeddings endpoint."""
    headers = {"Authorization": f"Bearer {hf_token}"}
    url = "https://allam-llm-e5-embeddings.hf.space/e5_embeddings"
    # Pass the query as a request parameter so it is URL-encoded correctly.
    response = requests.get(url, headers=headers, params={"query": query})
    if response.status_code == 200:
        return torch.tensor(response.json())
    print(f"Error: {response.status_code}")
    return None


# FAISS: exact inner-product index over the precomputed article embeddings.
dimension = MOJ_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(np.ascontiguousarray(MOJ_embeddings, dtype='float32'))  # FAISS expects float32


def query_search(query, K):
    """Return the positional indices of the K articles closest to the query."""
    query_embedding = embed_single_text(query)
    # FAISS expects a 2-D float32 array of shape (n_queries, dimension).
    query_embedding = np.asarray(query_embedding, dtype='float32').reshape(1, -1)
    distances, indices = index.search(query_embedding, K)
    results = []
    for idx in indices[0]:
        file_id = df.iloc[idx]['File_ID']
        row_number = df.iloc[idx]['Row_Number']
        # results.append((file_id, row_number))
        results.append(idx)
    return results
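
# Optional sketch (assumption, not part of the original pipeline): IndexFlatIP ranks by
# raw inner product, so cosine-style ranking only holds if the vectors are L2-normalized.
# If the stored e5 embeddings are not already normalized, a hypothetical helper like the
# one below could build a normalized copy of the index; query vectors would need the same
# faiss.normalize_L2 treatment before searching.
def build_cosine_index(embeddings):
    """Hypothetical helper: inner-product index over L2-normalized copies of the embeddings."""
    vectors = np.ascontiguousarray(embeddings, dtype='float32').copy()
    faiss.normalize_L2(vectors)  # in-place row-wise L2 normalization
    cosine_index = faiss.IndexFlatIP(vectors.shape[1])
    cosine_index.add(vectors)
    return cosine_index
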

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def return_top5_chunks(query):
    """Retrieve 15 candidate articles, split them into chunks, and keep the 5 chunks
    most similar to the query under TF-IDF cosine similarity."""
    matching_indices = query_search(query, 15)
    relevant_rows = df.iloc[matching_indices].copy()  # copy to avoid SettingWithCopyWarning

    def chunk_text(text, max_words=150):
        words = text.split()
        return [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]

    relevant_rows['Chunks'] = relevant_rows['Article_text'].apply(chunk_text)

    chunked_texts = []
    for idx, row in relevant_rows.iterrows():
        for chunk in row['Chunks']:
            chunked_texts.append((chunk, idx))

    def find_top_k_similar(texts, query, k):
        documents = [text for text, _ in texts]
        vectorizer = TfidfVectorizer()
        all_texts = documents + [query]
        tfidf_matrix = vectorizer.fit_transform(all_texts)
        similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()
        top_k_indices = similarities.argsort()[-k:][::-1]
        return [(texts[i], similarities[i]) for i in top_k_indices]

    top_5_chunks = find_top_k_similar(chunked_texts, query, 5)

    # Concatenate the winning chunks, separated by ##########, for the index-picking prompt.
    chunks_txt = ''
    for i, ((chunk, idx), similarity) in enumerate(top_5_chunks):
        chunks_txt += f"Index: {idx},\nChunk: {chunk}\n"
        if i < len(top_5_chunks) - 1:
            chunks_txt += "##########\n"
    return chunks_txt


# Exchange the IBM Cloud API key for a short-lived IAM access token.
api_key = 'UEGtyhQpPCKfhsQ_rPlBbEsgZErSh8xPU57qm9DQ-ZkC'
url = "https://iam.cloud.ibm.com/identity/token"
headers = {"Content-Type": "application/x-www-form-urlencoded"}
data = {"grant_type": "urn:ibm:params:oauth:grant-type:apikey", "apikey": api_key}
response = requests.post(url, headers=headers, data=data)
token_info = response.json()
access_token = token_info['access_token']


def allam_response(context, query):
    """Generate the final Arabic answer from the selected context chunk."""
    url = "https://eu-de.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29"
    input_text_base = f"""
    [Context]: {context}
    [System]: You are a friendly Arabic chatbot named مستنير.
    You will be provided with an Arabic context. Your task is to extract an answer to the question only from the provided context.
    Elaborate on the answer using the context.
    At the end of your response, mention the Article: مادة
    If no answer is found, apologize.
    Question: {query}
    """
    body = {
        "input": input_text_base,
        "parameters": {
            "decoding_method": "greedy",
            "max_new_tokens": 900,
            "min_new_tokens": 0,
            "stop_sequences": [],
            "repetition_penalty": 1
        },
        "model_id": "sdaia/allam-1-13b-instruct",
        "project_id": "72a4dcd4-e6e9-4cdc-9c7e-1a0ef1483936"
    }
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {access_token}"
    }
    response = requests.post(url, headers=headers, json=body)
    if response.status_code != 200:
        raise Exception("Non-200 response: " + str(response.text))
    response = response.json()
    return response['results'][0]['generated_text']


import json
import re


def index_num(text):
    """Extract the article index from the model's {"Index": "<n>"} reply, or None if absent."""
    match = re.search(r'"Index":\s*"(\d+)"', text)
    index_number = match.group(1) if match else None
    return int(index_number) if index_number is not None else None


def get_top_matching_chunk(text, query, max_words=500):
    """Split one article into chunks and return the chunk most similar to the query."""
    def chunk_text(text, max_words):
        words = text.split()
        return [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]

    chunks = chunk_text(text, max_words)
    vectorizer = TfidfVectorizer()
    all_texts = chunks + [query]
    tfidf_matrix = vectorizer.fit_transform(all_texts)
    similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()
    top_chunk_index = similarities.argmax()
    return chunks[top_chunk_index]


def reformat_indentation(text, indent_spaces=4):
    """Re-indent every line of `text` with `indent_spaces` spaces."""
    indent = ' ' * indent_spaces
    lines = text.splitlines()
    formatted_lines = [indent + line.strip() for line in lines]
    return '\n'.join(formatted_lines)


def return_index_num(data_text, query):
    """Ask the model which chunk (by Index) contains the answer; return its raw JSON reply."""
    url = "https://eu-de.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29"
    sys_prompt = """
    Identify the **first** Index chunk with the answer to a given question.
    Chunks are separated by ##########
    Respond only in **JSON** format, **do not return any words**:
    {"Index": "extracted_Index"}
    Or:
    {"Index": "not_found"}
    **No additional text allowed**.
    """
    sys_prompt += f"Question : {query}"
    input_text = f"""
    [Context]: {data_text.strip()}
    [System]: {sys_prompt.strip()}
    """
    input_text = reformat_indentation(input_text, indent_spaces=0)
    body = {
        "input": input_text,
        "parameters": {
            "decoding_method": "greedy",
            "max_new_tokens": 20,
            "repetition_penalty": 1
        },
        "model_id": "sdaia/allam-1-13b-instruct",
        "project_id": "72a4dcd4-e6e9-4cdc-9c7e-1a0ef1483936"
    }
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {access_token}"
    }
    response = requests.post(url, headers=headers, json=body)
    if response.status_code != 200:
        raise Exception("Non-200 response: " + str(response.text))
    response = response.json()
    return response['results'][0]['generated_text']


def allam_llm(q):
    """Full pipeline: retrieve chunks, pick the best article, re-chunk it, and answer."""
    chunks_text = return_top5_chunks(q)
    targeted_chunk = return_index_num(chunks_text, q)
    index_number = index_num(targeted_chunk)
    text_to_chunk = df['Article_text'][index_number]
    top_chunk = get_top_matching_chunk(text_to_chunk, q)
    allam_res = allam_response(top_chunk, q)
    return allam_res
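
# Minimal usage sketch: the question below is a hypothetical placeholder; running it
# requires the Excel/embeddings files, the hosted e5 embedding Space, and valid IBM
# watsonx credentials configured above.
if __name__ == "__main__":
    sample_question = "ما هي مدة الإجازة السنوية للعامل؟"  # hypothetical example question
    print(allam_llm(sample_question))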