ZillionParts-PDF2Doc

Running

File size: 7,243 Bytes

ee3f9f8

import pandas as pd
import faiss
import numpy as np
import torch
import requests
import os
#import huggingface_hub
# Hugging Face access token taken from the "hf_token" environment variable
# (None when the variable is not set).
hf_token = os.environ.get("hf_token")
#huggingface_hub.login(hf_token)

# Article table; the 'Article_text' column holds the text that gets searched.
df = pd.read_excel("Allam_SA_Articles.xlsx")
input_texts = df['Article_text'].to_list()
# Precomputed embedding matrix -- presumably one row per article in `df`;
# TODO confirm the row order matches the spreadsheet.
MOJ_embeddings = np.load('Allam_embeddings.npy')


def embed_single_text(query):
    """Fetch the embedding of *query* from the hosted e5 embedding endpoint.

    Args:
        query: Text to embed.

    Returns:
        A ``torch.Tensor`` built from the JSON body of the response, or
        ``None`` (after printing the status code) when the endpoint does not
        answer with HTTP 200.
    """
    headers = {
        "Authorization": f"Bearer {hf_token}"
    }

    url = "https://allam-llm-e5-embeddings.hf.space/e5_embeddings"

    # Hand the text to requests via `params` so it is percent-encoded.
    # Interpolating it into the URL lets characters such as '#', '&', '+'
    # or '%' truncate or corrupt the query parameter.
    response = requests.get(url, headers=headers, params={"query": query})

    if response.status_code == 200:
        return torch.tensor(response.json())

    print(f"Error: {response.status_code}")
    return None


#Faiss
# Flat (exhaustive) inner-product index over the precomputed embeddings.
dimension = MOJ_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
# Faiss operates on C-contiguous float32 matrices. Coerce explicitly so an
# array saved with another dtype or layout still indexes correctly; this is a
# no-op (no copy) when the array already has the required form.
index.add(np.ascontiguousarray(MOJ_embeddings, dtype=np.float32))

def query_search(query, K):
    """Return the positions of the K nearest embeddings for *query*.

    Args:
        query: Text to embed and search for.
        K: Number of neighbours requested from the faiss index.

    Returns:
        A list of integer row positions, best match first. It can hold fewer
        than K entries when the index contains fewer than K vectors.

    Raises:
        RuntimeError: If no embedding could be obtained for the query.
    """
    query_embedding = embed_single_text(query)
    if query_embedding is None:
        # embed_single_text reports HTTP failures by returning None; fail here
        # with a clear message instead of an opaque error inside faiss.
        raise RuntimeError("Embedding request failed; cannot search the index.")

    # faiss expects a C-contiguous float32 matrix of shape (n_queries, dim).
    vectors = np.ascontiguousarray(query_embedding, dtype=np.float32)
    if vectors.ndim == 1:
        vectors = vectors.reshape(1, -1)

    _, indices = index.search(vectors, K)

    # faiss pads missing neighbours with -1; dropping them keeps callers from
    # treating -1 as "last row" when indexing positionally.
    return [int(idx) for idx in indices[0] if idx >= 0]

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def return_top5_chunks(query):
    """Build the context string holding the five chunks most similar to *query*.

    The 15 nearest articles are fetched with ``query_search``, split into
    chunks of at most 150 words, and ranked against the query by TF-IDF
    cosine similarity.

    Args:
        query: The user question.

    Returns:
        A string with one ``Index: <row label>,\\nChunk: <text>\\n`` entry per
        selected chunk, entries separated by ``##########\\n``. Returns an
        empty string when the retrieved articles yield no chunks.
    """
    matching_indices = query_search(query, 15)
    relevant_rows = df.iloc[matching_indices]

    def chunk_text(text, max_words=150):
        # Split on whitespace and regroup into windows of max_words words.
        words = text.split()
        return [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]

    # Pair every chunk with the label of the row it came from. Building the
    # list directly avoids assigning a new column onto a slice of `df`.
    # Non-string cells (e.g. NaN from empty spreadsheet cells) are skipped
    # because they cannot be split into words.
    chunked_texts = [
        (chunk, idx)
        for idx, text in relevant_rows['Article_text'].items()
        if isinstance(text, str)
        for chunk in chunk_text(text)
    ]

    if not chunked_texts:
        # Nothing to rank; the similarity computation below cannot run on an
        # empty document set.
        return ''

    def find_top_k_similar(texts, query, k):
        # Fit TF-IDF on the chunks plus the query (last row), then rank the
        # chunks by cosine similarity to that last row.
        documents = [text for text, _ in texts]
        tfidf_matrix = TfidfVectorizer().fit_transform(documents + [query])
        similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()
        top_k_indices = similarities.argsort()[-k:][::-1]
        return [(texts[i], similarities[i]) for i in top_k_indices]

    top_5_chunks = find_top_k_similar(chunked_texts, query, 5)

    return "##########\n".join(
        f"Index: {idx},\nChunk: {chunk}\n" for (chunk, idx), _ in top_5_chunks
    )


import requests


# IBM Cloud API key used to obtain an IAM bearer token.
# Prefer the IBM_API_KEY environment variable.
# NOTE(review): the literal fallback is a credential committed to source. It is
# kept only so existing deployments keep working; rotate the key and remove the
# fallback once the environment variable is configured.
api_key = os.getenv("IBM_API_KEY", 'UEGtyhQpPCKfhsQ_rPlBbEsgZErSh8xPU57qm9DQ-ZkC')

url = "https://iam.cloud.ibm.com/identity/token"

headers = {
    "Content-Type": "application/x-www-form-urlencoded"
}

data = {
    "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
    "apikey": api_key
}

# Exchange the API key for a bearer token once, at import time.
response = requests.post(url, headers=headers, data=data)
# Surface a rejected key or an outage as an HTTP error carrying the status
# code, rather than a bare KeyError on the missing 'access_token' field.
response.raise_for_status()
token_info = response.json()
access_token = token_info['access_token']




def allam_response(context, query):
    """Generate the chatbot answer to *query* from *context*.

    Sends a prompt made of the context, the system instructions and the
    question to the text-generation endpoint, authenticated with the
    module-level ``access_token``.

    Args:
        context: Text inserted after ``[Context]:`` in the prompt.
        query: Question inserted after ``Question:`` in the prompt.

    Returns:
        The ``generated_text`` of the first entry in the response's
        ``results`` list.

    Raises:
        Exception: If the endpoint answers with a status other than 200.
    """
    # The prompt literal is sent to the model as-is, including its blank
    # lines and leading spaces.
    prompt = f"""

    [Context]: {context}

    [System]:

    You are an Arabic frindley chatbot named مستنير.

    You will be provided with an Arabic context ,

    Your task is to extract and Answer for the questions only from the context provided

    elaborate on the answer from the context

    At the end of your response mention the Article : مادة

    if no answer is found apologize



    Question: {query}

    """

    generation_parameters = {
        "decoding_method": "greedy",
        "max_new_tokens": 900,
        "min_new_tokens": 0,
        "stop_sequences": [],
        "repetition_penalty": 1
    }
    payload = {
        "input": prompt,
        "parameters": generation_parameters,
        "model_id": "sdaia/allam-1-13b-instruct",
        "project_id": "72a4dcd4-e6e9-4cdc-9c7e-1a0ef1483936"
    }
    request_headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {access_token}"
    }

    reply = requests.post(
        "https://eu-de.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29",
        headers=request_headers,
        json=payload,
    )

    if reply.status_code == 200:
        return reply.json()['results'][0]['generated_text']

    raise Exception("Non-200 response: " + str(reply.text))



import json

import re

def index_num(text):
    """Extract the integer index from a ``{"Index": ...}`` style reply.

    Accepts the index either as a quoted string (``"Index": "12"``) or as a
    bare integer (``"Index": 12``).

    Args:
        text: Raw reply text that may contain the JSON-like object.

    Returns:
        The index as an ``int``, or ``None`` when the text holds no numeric
        index (for example ``{"Index": "not_found"}``).
    """
    match = re.search(r'"Index"\s*:\s*(?:"(\d+)"|(\d+)\b)', text)
    if match is None:
        # No numeric index present; int(None) would raise TypeError here.
        return None

    return int(match.group(1) or match.group(2))

def get_top_matching_chunk(text, query, max_words=500):
    """Return the chunk of *text* most similar to *query*.

    The text is split on whitespace into chunks of at most ``max_words``
    words (words re-joined with single spaces), and the chunks are ranked by
    TF-IDF cosine similarity to the query.

    Args:
        text: Text to split and search.
        query: Question the chunks are compared against.
        max_words: Maximum number of words per chunk.

    Returns:
        The best-matching chunk, or an empty string when *text* contains no
        words.
    """
    words = text.split()
    chunks = [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]

    if not chunks:
        # Empty or whitespace-only text: cosine_similarity cannot run against
        # zero documents.
        return ''
    if len(chunks) == 1:
        # A single candidate is always the best match; skipping the
        # vectoriser also avoids its failure on an empty vocabulary.
        return chunks[0]

    # The query is appended as the last row and compared with every chunk.
    tfidf_matrix = TfidfVectorizer().fit_transform(chunks + [query])
    similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()

    return chunks[similarities.argmax()]

def reformat_indentation(text, indent_spaces=4):
    """Re-indent every line of *text* with a uniform number of spaces.

    Each line is stripped of leading and trailing whitespace and then
    prefixed with ``indent_spaces`` spaces; the lines are joined with
    ``\\n``.

    Args:
        text: Text to re-indent.
        indent_spaces: Number of spaces placed before every line.

    Returns:
        The re-indented text.
    """
    prefix = ' ' * indent_spaces
    return '\n'.join(prefix + raw_line.strip() for raw_line in text.splitlines())

def return_index_num(data_text, query):
    """Ask the model which chunk in *data_text* answers *query*.

    The prompt instructs the model to reply only with
    ``{"Index": "<index>"}`` or ``{"Index": "not_found"}``. Every prompt line
    is stripped of surrounding whitespace by ``reformat_indentation`` before
    sending. The request is authenticated with the module-level
    ``access_token``.

    Args:
        data_text: Candidate chunks, separated by ``##########``.
        query: The user question.

    Returns:
        The raw ``generated_text`` of the first entry in the response's
        ``results`` list.

    Raises:
        Exception: If the endpoint answers with a status other than 200.
    """
    # System instructions; the literal is part of the prompt and is kept
    # verbatim.
    instructions = """

    Identify the **first** Index chunk with the answer to a given question.

    Chunks are seperated by ##########

    Respond only with **Json** format **do not return any words**:



    {"Index": "extracted_Index"}



    Or:



    {"Index": "not_found"}



    **No additional text allowed**.



    """ + f"Question : {query}"

    context_part = data_text.strip()
    system_part = instructions.strip()
    prompt = reformat_indentation(f"""

    [Context]: {context_part}

    [System]: {system_part}

    """, indent_spaces=0)

    payload = {
      "input": prompt,
      "parameters": {
          "decoding_method": "greedy",
          "max_new_tokens": 20,
          "repetition_penalty": 1
      },
      "model_id": "sdaia/allam-1-13b-instruct",
      "project_id": "72a4dcd4-e6e9-4cdc-9c7e-1a0ef1483936"
    }
    request_headers = {
      "Accept": "application/json",
      "Content-Type": "application/json",
      "Authorization": f"Bearer {access_token}"  # module-level token
    }

    reply = requests.post(
        "https://eu-de.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29",
        headers=request_headers,
        json=payload,
    )

    if reply.status_code == 200:
        return reply.json()['results'][0]['generated_text']

    raise Exception("Non-200 response: " + str(reply.text))



def allam_llm(q):
    """Answer question *q* with the retrieve-select-generate pipeline.

    Steps: retrieve the top chunks for the question, ask the model which
    chunk holds the answer, take the article at that index, pick that
    article's best-matching chunk, and generate the final answer from it.

    Args:
        q: The user question.

    Returns:
        The text generated by ``allam_response``.
    """
    chunks_text = return_top5_chunks(q)

    targeted_chunk = return_index_num(chunks_text, q)

    # The selector model may answer "not_found" or return malformed text, in
    # which case no integer index can be extracted.
    try:
        index_number = index_num(targeted_chunk)
    except (TypeError, ValueError):
        index_number = None

    if index_number is None or index_number not in df.index:
        # No usable article index: answer from the retrieved chunks instead
        # of crashing. The answer prompt tells the model to apologise when
        # the context holds no answer.
        return allam_response(chunks_text, q)

    text_to_chunk = df.loc[index_number, 'Article_text']

    top_chunk = get_top_matching_chunk(text_to_chunk, q)

    return allam_response(top_chunk, q)