ZillionParts-PDF2Doc

Running

App Files Files Community

ZillionParts-PDF2Doc / Allam_Backend_HF.py

E-slam

Upload Allam_Backend_HF.py

ee3f9f8 verified 4 months ago

raw

history blame

7.24 kB

	import pandas as pd
	import faiss
	import numpy as np
	import torch
	import requests
	import os
	#import huggingface_hub
	# Hugging Face access token, read from the "hf_token" environment variable
	# (os.getenv yields None when the variable is unset).
	hf_token = os.getenv("hf_token")
	#huggingface_hub.login(hf_token)

	# Article table loaded from the Excel workbook at import time; the
	# 'Article_text' column holds the text that is searched and chunked below.
	df = pd.read_excel("Allam_SA_Articles.xlsx")
	input_texts = df['Article_text'].tolist()
	# Precomputed embedding matrix loaded from disk.
	# NOTE(review): presumably row i is the embedding of df row i — confirm
	# against the script that produced the .npy file.
	MOJ_embeddings = np.load('Allam_embeddings.npy')


	def embed_single_text(query):
	    """Fetch the embedding of ``query`` from the hosted E5 embeddings endpoint.

	    The query is passed through ``params=`` so that ``requests``
	    percent-encodes it. Interpolating raw user text into the URL breaks as
	    soon as it contains ``&``, ``#``, ``+`` or ``%``.

	    Args:
	        query: Text to embed.

	    Returns:
	        A ``torch.Tensor`` built from the JSON response on HTTP 200, or
	        ``None`` (after printing the status code) for any other status.
	    """
	    response = requests.get(
	        "https://allam-llm-e5-embeddings.hf.space/e5_embeddings",
	        params={"query": query},
	        headers={"Authorization": f"Bearer {hf_token}"},
	        # Without a timeout a stalled endpoint would block the caller forever.
	        timeout=60,
	    )

	    if response.status_code != 200:
	        print(f"Error: {response.status_code}")
	        return None

	    return torch.tensor(response.json())


	# FAISS: build a flat inner-product index over the precomputed embeddings.
	# The vector width is taken from the second axis of the loaded matrix.
	# NOTE(review): inner-product ranking matches cosine similarity only if the
	# stored vectors and the query vectors are L2-normalised — confirm.
	# NOTE(review): FAISS works on float32 data — verify the dtype of the .npy file.
	dimension = MOJ_embeddings.shape[1]
	index = faiss.IndexFlatIP(dimension)
	index.add(MOJ_embeddings)

	def query_search(query, K):
	    """Return the positions of the ``K`` indexed articles closest to ``query``.

	    The query is embedded with ``embed_single_text`` and searched in the
	    module-level FAISS ``index``.

	    Args:
	        query: Text to search for.
	        K: Number of neighbours to request from FAISS.

	    Returns:
	        A list of integer row positions (usable with ``df.iloc``), best
	        match first. It may hold fewer than ``K`` entries when the index
	        contains fewer vectors.

	    Raises:
	        RuntimeError: If the embedding service did not return an embedding.
	    """
	    query_embedding = embed_single_text(query)
	    if query_embedding is None:
	        # embed_single_text signals an HTTP error with None; fail here with a
	        # clear message instead of an obscure error inside FAISS.
	        raise RuntimeError(
	            "Could not embed the query: the embedding service returned an error."
	        )

	    # FAISS expects a C-contiguous float32 matrix of shape (n_queries, dim).
	    vectors = np.ascontiguousarray(np.asarray(query_embedding, dtype=np.float32))
	    if vectors.ndim == 1:
	        vectors = vectors.reshape(1, -1)

	    _, indices = index.search(vectors, K)

	    # FAISS pads missing neighbours with -1; keeping them would silently pick
	    # the last row through df.iloc[-1]. The per-row File_ID / Row_Number
	    # lookups of the earlier version were never used and are dropped.
	    return [int(idx) for idx in indices[0] if idx >= 0]

	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity

	def return_top5_chunks(query, k=5, n_candidates=15, max_words=150):
	    """Build the context block of the chunks most similar to ``query``.

	    The ``n_candidates`` nearest articles are retrieved with ``query_search``
	    and split into chunks of at most ``max_words`` words. The chunks are
	    ranked against the query by TF-IDF cosine similarity and the best ``k``
	    are formatted.

	    Args:
	        query: The user question.
	        k: Number of chunks to keep (default 5).
	        n_candidates: Number of articles fetched from the vector index
	            (default 15).
	        max_words: Maximum number of words per chunk (default 150).

	    Returns:
	        A string of ``"Index: <row label>,\\nChunk: <text>\\n"`` entries,
	        best match first, separated by ``"##########\\n"``. An empty string
	        is returned when there is nothing to rank.
	    """
	    matching_indices = query_search(query, n_candidates)
	    relevant_rows = df.iloc[matching_indices]

	    # Collect (chunk, row_label) pairs directly. Adding a 'Chunks' column to
	    # a slice of df triggers pandas' SettingWithCopyWarning and is not needed.
	    chunked_texts = []
	    for row_label, article in relevant_rows['Article_text'].items():
	        words = article.split()
	        chunked_texts.extend(
	            (' '.join(words[start:start + max_words]), row_label)
	            for start in range(0, len(words), max_words)
	        )

	    # TF-IDF cannot be fitted against an empty document set, and a
	    # non-positive k would make the [-k:] slice below select everything.
	    if k <= 0 or not chunked_texts:
	        return ''

	    vectorizer = TfidfVectorizer()
	    # The query is appended as the last document so it shares the vocabulary.
	    tfidf_matrix = vectorizer.fit_transform(
	        [chunk for chunk, _ in chunked_texts] + [query]
	    )
	    similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()
	    top_k_indices = similarities.argsort()[-k:][::-1]

	    return "##########\n".join(
	        f"Index: {chunked_texts[i][1]},\nChunk: {chunked_texts[i][0]}\n"
	        for i in top_k_indices
	    )


	import requests


	# SECURITY: an API key must not live in source control. The key is now read
	# from the IBM_CLOUD_API_KEY environment variable. The literal below is kept
	# only as a fallback so existing deployments keep working; it has been
	# published with this file and should be rotated and then deleted.
	api_key = os.getenv("IBM_CLOUD_API_KEY") or 'UEGtyhQpPCKfhsQ_rPlBbEsgZErSh8xPU57qm9DQ-ZkC'

	url = "https://iam.cloud.ibm.com/identity/token"

	headers = {
	    "Content-Type": "application/x-www-form-urlencoded"
	}

	data = {
	    "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
	    "apikey": api_key
	}

	# Exchange the API key for a bearer token once, at import time.
	# NOTE(review): IAM access tokens expire; a long-running process will need
	# to refresh `access_token` — confirm the token lifetime for this account.
	response = requests.post(url, headers=headers, data=data, timeout=30)
	# Fail with the HTTP error rather than a KeyError on 'access_token' below.
	response.raise_for_status()
	token_info = response.json()
	access_token = token_info['access_token']




	def allam_response(context, query):
	    """Ask the ALLaM model to answer ``query`` using only ``context``.

	    Args:
	        context: Text placed in the ``[Context]`` section of the prompt.
	        query: The user question appended after the system instructions.

	    Returns:
	        The ``generated_text`` of the first result in the service response.

	    Raises:
	        Exception: If the generation endpoint answers with a non-200 status.
	    """
	    prompt = f"""
	[Context]: {context}
	[System]:
	You are an Arabic frindley chatbot named مستنير.
	You will be provided with an Arabic context ,
	Your task is to extract and Answer for the questions only from the context provided
	elaborate on the answer from the context
	At the end of your response mention the Article : مادة
	if no answer is found apologize

	Question: {query}
	"""
	    # Greedy decoding with room for a long answer.
	    generation_params = {
	        "decoding_method": "greedy",
	        "max_new_tokens": 900,
	        "min_new_tokens": 0,
	        "stop_sequences": [],
	        "repetition_penalty": 1
	    }
	    payload = {
	        "input": prompt,
	        "parameters": generation_params,
	        "model_id": "sdaia/allam-1-13b-instruct",
	        "project_id": "72a4dcd4-e6e9-4cdc-9c7e-1a0ef1483936"
	    }

	    reply = requests.post(
	        "https://eu-de.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29",
	        headers={
	            "Accept": "application/json",
	            "Content-Type": "application/json",
	            "Authorization": f"Bearer {access_token}"
	        },
	        json=payload,
	    )

	    if reply.status_code == 200:
	        return reply.json()['results'][0]['generated_text']

	    raise Exception("Non-200 response: " + str(reply.text))



	import json

	import re

	def index_num(text):
	    """Extract the integer value of the ``"Index"`` field from a model reply.

	    Both ``{"Index": "12"}`` and ``{"Index": 12}`` are accepted, with
	    arbitrary text around the JSON fragment.

	    Args:
	        text: Raw text produced by the model.

	    Returns:
	        The index as an ``int``.

	    Raises:
	        ValueError: If ``text`` holds no numeric ``"Index"`` value, e.g. the
	            model answered ``{"Index": "not_found"}``. The earlier version
	            failed here with an opaque ``TypeError`` from ``int(None)``.
	    """
	    # ("?) ... \1 requires the quotes around the digits to be balanced:
	    # either both present or both absent.
	    match = re.search(r'"Index"\s*:\s*("?)(\d+)\1', text)
	    if match is None:
	        raise ValueError(f"No numeric Index found in model output: {text!r}")

	    return int(match.group(2))

	def get_top_matching_chunk(text, query, max_words=500):
	    """Return the chunk of ``text`` that is most similar to ``query``.

	    ``text`` is split on whitespace into chunks of at most ``max_words``
	    words. The chunks are ranked against ``query`` by TF-IDF cosine
	    similarity.

	    Args:
	        text: The article text to split.
	        query: The user question.
	        max_words: Maximum number of words per chunk (default 500).

	    Returns:
	        The best-matching chunk. An empty string is returned when ``text``
	        contains no words. When ``text`` fits in a single chunk, that chunk
	        is returned directly.
	    """
	    words = text.split()
	    chunks = [
	        ' '.join(words[start:start + max_words])
	        for start in range(0, len(words), max_words)
	    ]

	    # Nothing to rank: TF-IDF and argmax would both fail on an empty set.
	    if not chunks:
	        return ''
	    # A single chunk always wins, so skip the vectoriser (which can also
	    # fail on very short texts with an empty vocabulary).
	    if len(chunks) == 1:
	        return chunks[0]

	    vectorizer = TfidfVectorizer()
	    # The query is appended as the last document so it shares the vocabulary.
	    tfidf_matrix = vectorizer.fit_transform(chunks + [query])
	    similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()

	    return chunks[similarities.argmax()]

	def reformat_indentation(text, indent_spaces=4):
	    """Re-indent every line of ``text`` with ``indent_spaces`` spaces.

	    Each line is stripped of leading and trailing whitespace before the new
	    indent is prepended. Lines are joined with ``"\\n"`` and no trailing
	    newline is added.
	    """
	    prefix = ' ' * indent_spaces
	    return '\n'.join(f"{prefix}{raw_line.strip()}" for raw_line in text.splitlines())

	def return_index_num(data_text, query):
	    """Ask the ALLaM model which chunk of ``data_text`` answers ``query``.

	    Args:
	        data_text: Chunk listing placed in the ``[Context]`` section; the
	            prompt tells the model that chunks are separated by
	            ``##########``.
	        query: The user question appended to the system instructions.

	    Returns:
	        The raw ``generated_text`` of the first result. The prompt asks for
	        a JSON object with an ``"Index"`` key.

	    Raises:
	        Exception: If the generation endpoint answers with a non-200 status.
	    """
	    instructions = """
	Identify the first Index chunk with the answer to a given question.
	Chunks are seperated by ##########
	Respond only with Json format do not return any words:

	{"Index": "extracted_Index"}

	Or:

	{"Index": "not_found"}

	No additional text allowed.

	""" + f"Question : {query}"

	    # Every prompt line is stripped so the model sees no indentation at all.
	    prompt = reformat_indentation(f"""
	[Context]: {data_text.strip()}
	[System]: {instructions.strip()}
	""", indent_spaces=0)

	    payload = {
	        "input": prompt,
	        # Only a short JSON fragment is expected, hence the small token budget.
	        "parameters": {
	            "decoding_method": "greedy",
	            "max_new_tokens": 20,
	            "repetition_penalty": 1
	        },
	        "model_id": "sdaia/allam-1-13b-instruct",
	        "project_id": "72a4dcd4-e6e9-4cdc-9c7e-1a0ef1483936"
	    }

	    reply = requests.post(
	        "https://eu-de.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29",
	        headers={
	            "Accept": "application/json",
	            "Content-Type": "application/json",
	            # access_token is the module-level IAM bearer token.
	            "Authorization": f"Bearer {access_token}"
	        },
	        json=payload,
	    )

	    if reply.status_code == 200:
	        return reply.json()['results'][0]['generated_text']

	    raise Exception("Non-200 response: " + str(reply.text))



	def allam_llm(q):
	    """Answer question ``q`` end to end.

	    Steps: collect candidate chunks, let the model pick the index of the
	    chunk that holds the answer, look up that article's text in ``df``,
	    select its best-matching chunk and generate the final answer from it.

	    Args:
	        q: The user question.

	    Returns:
	        The answer text produced by ``allam_response``.
	    """
	    candidate_block = return_top5_chunks(q)

	    # The model replies with a small JSON fragment; index_num pulls the integer out.
	    article_label = index_num(return_index_num(candidate_block, q))

	    article_text = df['Article_text'][article_label]
	    best_chunk = get_top_matching_chunk(article_text, q)

	    return allam_response(best_chunk, q)