elia-waefler committed · Commit 8b9aa2c · Parent(s): 661b653
Commit message: init files, idea

Files changed:
- .idea/.name +1 -0
- my_1_openai.py +59 -0
- my_1_reader.py +201 -0
- my_1_writer.py +98 -0
- my_2_embedder.py +169 -0
- my_2_sim_search.py +109 -0
.idea/.name
ADDED
@@ -0,0 +1 @@
+app.py
my_1_openai.py
ADDED
@@ -0,0 +1,59 @@
+import os
+import openai
+from openai import OpenAI
+
+# one shared client, configured from the environment (openai>=1.0 interface)
+client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),
+                organization=os.environ.get("OPENAI_ORG_ID"))
+models = {
+    "assistant": "You are a helpful assistant.",
+    "binary": "you are a machine that converts questions or prompts to binary outputs. "
+              "you can only answer 'yes' or 'no'. if uncertain, default to 'no'."
+}
+
+
+def gpt4_new(prompt_text):
+    response = client.chat.completions.create(
+        model="gpt-4",
+        messages=[{"role": "system",
+                   # German system prompt: "You are a machine that classifies documents."
+                   "content": "Du bist eine Maschine, die Dokumente klassifiziert."},
+                  {"role": "user", "content": prompt_text}])
+    return response.choices[0].message.content
+
+
+def gpt4(prompt, model=models["assistant"]):
+    response = client.chat.completions.create(
+        model="gpt-4",
+        messages=[
+            {"role": "system", "content": model},
+            {"role": "user", "content": prompt}
+        ]
+    )
+    return response.choices[0].message.content
+
+
+def gpt_bool(prompt):
+    """
+    :param prompt: the text prompt
+    :return: True or False
+    """
+    true_values = ["yes", "Yes", "Y", "y", "yes.", "Yes.", "YES"]
+    return gpt4(prompt, model=models["binary"]) in true_values
+
+
+def vectorize_data(data_input):
+    try:
+        response = client.embeddings.create(input=data_input, model="text-embedding-ada-002")
+    except openai.BadRequestError as err:
+        print(err)
+        return [0, 0, 0]
+    return response.data[0].embedding
+
+
+if __name__ == "__main__":
+    print("here are all functions that directly call openai.")
+    print("hi, im chatGPT how can I help? ")
+    while True:
+        print(gpt_bool(input()))
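
A minimal usage sketch for the helpers above (illustrative, not part of the commit); it assumes OPENAI_API_KEY and OPENAI_ORG_ID are set in the environment:

    from my_1_openai import gpt_bool, vectorize_data

    # yes/no classification through the "binary" system prompt
    print(gpt_bool("Is this document an electrical installation plan?"))

    # 1536-dimensional Ada-002 embedding; falls back to [0, 0, 0] on an API error
    vec = vectorize_data("this is the test string")
    print(len(vec))
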
my_1_reader.py
ADDED
@@ -0,0 +1,201 @@
+# NEEDS CLEANUP (original note: "MUSS AUFGERÄUMT WERDEN")
+
+import json
+import os
+import subprocess
+import PyPDF2
+import csv
+import fitz  # PyMuPDF
+
+
+def extract_text_from_pdf(pdf_path):
+    """
+    Extracts all text from a PDF file.
+
+    :param pdf_path: Path to the PDF file.
+    :return: Extracted text as a string.
+    """
+    # Open the PDF file
+    doc = fitz.open(pdf_path)
+
+    # Initialize an empty string to hold the text
+    text = ''
+
+    # Iterate through each page in the PDF
+    for page_num in range(len(doc)):
+        # Get a page
+        page = doc.load_page(page_num)
+
+        # Extract text from the page and add it to the result
+        text += page.get_text()
+
+    # Close the document
+    doc.close()
+
+    return text
+
+
+def read_pdfs_from_folder(folder_path):
+    """
+    Reads all PDF files in the specified folder using PdfReader and extracts their text.
+
+    Parameters:
+    - folder_path: The path to the folder containing PDF files.
+
+    Returns:
+    - A dictionary with file names as keys and their extracted text as values.
+    """
+    pdf_texts = {}
+    for filename in os.listdir(folder_path):
+        if filename.endswith('.pdf'):
+            file_path = os.path.join(folder_path, filename)
+            with open(file_path, 'rb') as pdf_file:
+                pdf_reader = PyPDF2.PdfReader(pdf_file)
+                text = ''
+                for page in pdf_reader.pages:
+                    try:
+                        text += page.extract_text()
+                    except UnicodeDecodeError as e:
+                        print(e)
+                # Normalize umlauts and escape characters that would break CSV/JSON handling
+                for old, new in (("ä", "ae"), ("Ä", "ae"), ("ö", "oe"), ("Ö", "oe"),
+                                 ("ü", "ue"), ("Ü", "ue"), (",", "_"), (";", "_"),
+                                 ("\\", "_"), ('"', "_"), ("\n", "<newline>")):
+                    text = text.replace(old, new)
+                pdf_texts[filename] = text
+    return pdf_texts
+
+
+def read_csv_lines_as_strings(filename):
+    """
+    Opens a CSV file and returns each line as a string in a list.
+
+    Parameters:
+    - filename: The path to the CSV file.
+
+    Returns:
+    - A list of strings, each representing a line from the CSV file.
+    """
+    lines_as_strings = []
+    with open(filename, newline='') as csvfile:
+        try:
+            reader = csv.reader(csvfile)
+            for row in reader:
+                # Convert the row (a list of values) back into a comma-separated string
+                line_as_string = ','.join(row)
+                lines_as_strings.append(line_as_string)
+        except UnicodeDecodeError as e:
+            print(e)
+    return lines_as_strings
+
+
+# Function to load data from JSON files
+def load_data(filename):
+    with open(filename, 'r') as file:
+        try:
+            return json.load(file)
+        except UnicodeDecodeError as err:
+            print(err)
+            return {}
+
+
+def find_and_open_file(filename, start_directory):
+    """
+    Attempts to open a file with the given filename starting from the specified directory.
+    If the file is not found, searches recursively in all subfolders. Works across macOS, Linux, and Windows.
+    """
+    for root, dirs, files in os.walk(start_directory):
+        if filename in files:
+            filepath = os.path.join(root, filename)
+            print(f"File found: {filepath}")
+            return filepath
+    print(f"File {filename} not found.")
+    return None
+
+
+def open_file(filepath):
+    """
+    Opens the file with the default application, based on the operating system.
+    """
+    if os.path.exists(filepath):
+        if os.name == 'posix':  # Linux, macOS, etc.
+            subprocess.call(('open', filepath))
+        elif os.name == 'nt':  # Windows
+            os.startfile(filepath)
+        else:
+            print(f"Cannot open file on this operating system: {filepath}")
+    else:
+        print(f"File does not exist: {filepath}")
+
+
+def list_folders_files_recursive(path, depth=0):
+    """
+    Recursively lists all folders and files within the specified path, including subfolders.
+
+    Parameters:
+    - path: The directory path to list contents from.
+    - depth: The current depth of recursion (used for indentation in print statements).
+
+    Returns:
+    - None
+    """
+    # Ensure the provided path is a directory
+    if not os.path.isdir(path):
+        print(f"The provided path '{path}' is not a valid directory.")
+        return
+
+    indent = ' ' * depth  # Indentation based on recursion depth
+    folders, files = [], []
+
+    # List all entries in the directory
+    for entry in os.listdir(path):
+        full_path = os.path.join(path, entry)
+        if os.path.isdir(full_path):
+            folders.append(entry)
+            print(f"{indent}Folder: {entry}")
+            # Recursively list subfolders and files
+            list_folders_files_recursive(full_path, depth + 1)
+        elif os.path.isfile(full_path):
+            files.append(entry)
+
+    for f in files:
+        print(f"{indent}File: {f}")
+
+
+def list_folders_files(path):
+    """
+    Lists all folders and files within the specified path.
+
+    Parameters:
+    - path: The directory path to list contents from.
+
+    Returns:
+    - A tuple of two lists: (folders, files).
+    """
+    folders = []
+    files = []
+
+    # Ensure the provided path is a directory
+    if not os.path.isdir(path):
+        print(f"The provided path '{path}' is not a valid directory.")
+        return folders, files
+
+    # List all entries in the directory
+    for entry in os.listdir(path):
+        full_path = os.path.join(path, entry)
+        if os.path.isdir(full_path):
+            folders.append(entry)
+        elif os.path.isfile(full_path):
+            files.append(entry)
+
+    return folders, files
+
+
+if __name__ == "__main__":
+    print("here are all functions that read files")
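
A short, hypothetical driver for the reader helpers above, assuming a local folder docs/ containing PDF files (the folder and file names are illustrative):

    import my_1_reader

    # one cleaned-up string per PDF, keyed by file name
    texts = my_1_reader.read_pdfs_from_folder("docs/")
    for name, body in texts.items():
        print(name, len(body))

    # find a file anywhere below the working directory and open it with the default app
    path = my_1_reader.find_and_open_file("report.pdf", ".")
    if path:
        my_1_reader.open_file(path)
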
my_1_writer.py
ADDED
@@ -0,0 +1,98 @@
+# NEEDS CLEANUP (original note: "MUSS AUFGERÄUMT WERDEN")
+
+import json
+import pandas as pd
+
+
+def split_json_file(input_filepath, lines_per_file=50):
+    """
+    Splits a JSON file into multiple files, each containing up to 'lines_per_file' lines.
+
+    :param input_filepath: The path to the input JSON file.
+    :param lines_per_file: The maximum number of lines per output file.
+    """
+    # Counter for file naming
+    file_counter = 1
+    # Open the input file
+    with open(input_filepath, 'r') as input_file:
+        # Read the lines from the input file
+        lines = input_file.readlines()
+        # Iterate through the lines in chunks of 'lines_per_file'
+        for i in range(0, len(lines), lines_per_file):
+            # Determine the output file name
+            output_filename = f'translate_data/english_{file_counter}.json'
+            # Write the current chunk to the output file
+            with open(output_filename, 'w') as output_file:
+                # Grab the current chunk of lines
+                chunk = lines[i:i+lines_per_file]
+                # Write each line to the output file
+                for line in chunk:
+                    output_file.write(line)
+            print(f'Created {output_filename}')
+            # Increment the file counter
+            file_counter += 1
+
+
+def merge_and_save(list1, list2, dict1, dict2, filename='output.csv'):
+    """
+    Merges two lists and two dictionaries into a pandas DataFrame according to the specified structure:
+    headers: ['list1', 'list2', 'keys dict1', 'vals dict1', 'keys dict2', 'vals dict2']
+    and saves it as a CSV file.
+
+    Parameters:
+    - list1 (list): First list to merge, contributing to column 'list1'.
+    - list2 (list): Second list to merge, contributing to column 'list2'.
+    - dict1 (dict): First dictionary to merge, keys and values added as separate columns.
+    - dict2 (dict): Second dictionary to merge, keys and values added as separate columns.
+    - filename (str): Filename for the saved CSV file.
+    """
+    # Combining all elements into a structured list of dictionaries for DataFrame construction
+    data = []
+    dict1_items = list(dict1.items())
+    dict2_items = list(dict2.items())
+    for i in range(len(list1)):
+        row = {
+            'list1': list1[i],
+            'list2': list2[i],
+            'keys dict1': dict1_items[i][0],
+            'vals dict1': dict1_items[i][1],
+            'keys dict2': dict2_items[i][0],
+            'vals dict2': dict2_items[i][1]
+        }
+        data.append(row)
+
+    # Creating the DataFrame
+    df = pd.DataFrame(data)
+
+    # Saving the DataFrame to a CSV file
+    df.to_csv(filename, index=False)
+    print(f"DataFrame saved as '{filename}' in the current directory.")
+
+
+# new line for every entry
+def safe_my_dict_as_json(file_name, my_dict):
+    print(my_dict)
+    # If a list was passed, use its first element (expected to be a dict)
+    if type(my_dict) == list:
+        my_dict = my_dict[0]
+    # Open a file for writing
+    with open(file_name, 'w') as f:
+        # Write the opening brace of the JSON object
+        f.write('{\n')
+        # Get total number of items to control comma insertion
+        total_items = len(my_dict)
+        # Iterate over items, keeping track of the current item index
+        for i, (key, value) in enumerate(my_dict.items()):
+            # Serialize the key with JSON to handle special characters and ensure proper quoting
+            json_key = json.dumps(key)
+            # Convert the value to a JSON-formatted string (without indentation)
+            json_value = json.dumps(value)
+            # Determine if a comma is needed (for all but the last item)
+            comma = ',' if i < total_items - 1 else ''
+            # Write the formatted string to the file
+            f.write(f" {json_key}: {json_value}{comma}\n")
+        # Write the closing brace of the JSON object
+        f.write('}\n')
+
+
+if __name__ == "__main__":
+    print("here are all functions that write to the Datasets")
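
safe_my_dict_as_json writes the JSON by hand so that every key keeps its whole vector on a single line. For comparison, json.dump with an indent also emits one entry per line, but it additionally spreads the nested vector lists over many lines; a small sketch of that trade-off (file name illustrative):

    import json

    my_dict = {"word_a": [0.1, 0.2], "word_b": [0.3, 0.4]}
    with open("output.json", "w") as f:
        json.dump(my_dict, f, indent=2)  # valid JSON, but each list element gets its own line
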
my_2_embedder.py
ADDED
@@ -0,0 +1,169 @@
+import my_1_reader
+import my_1_writer
+import my_1_openai
+import os
+import csv
+import openai
+import numpy as np
+import pdfminer
+import pdf2image
+from pdf2image import convert_from_path
+from PIL import Image
+
+
+# Assuming your my_1_openai's vectorize functions work as described
+def vectorize_data(data):
+    # Replace this with your actual logic to vectorize text data
+    return np.random.rand(100).tolist()  # Example vector
+
+
+def vectorize_image(data):
+    # Replace this with your actual logic to vectorize image data
+    return np.random.rand(100).tolist()  # Example vector
+
+
+def vectorize_this_pdf_with_metadata(pdf_path, output_path, metadata_filename="DS_U3/U3_Metadaten.csv"):
+    tensor_description = {
+        "my_id": 89,  # Example ID, ideally this should be dynamically generated
+        "og_name": pdf_path,
+        "metadata": {},
+        "vec_content_text": [],
+        "vec_content_img": []
+    }
+
+    # Read metadata from CSV and match by 'og_name' (pdf_path)
+    # other encodings that may be needed: 'latin1', 'ISO-8859-1', or 'cp1252'
+    with open(metadata_filename, mode='r', encoding='utf-8') as csvfile:
+        csv_reader = csv.DictReader(csvfile)
+        for row in csv_reader:
+            if row["Name"] == os.path.basename(pdf_path):  # Assuming 'Name' is a column in your CSV
+                tensor_description['metadata'] = row
+                break
+
+    # get text content
+    text = my_1_reader.extract_text_from_pdf(pdf_path)
+
+    # Vectorize extracted text
+    if text:
+        tensor_description['vec_content_text'].append(vectorize_data(data=text))
+
+    # Convert PDF pages to images using pdf2image
+    images = convert_from_path(pdf_path)
+    for img in images:
+        # vectorize_image expects a PIL image; pdf2image.convert_from_path already returns PIL images
+        img_vector = vectorize_image(data=img)
+        tensor_description['vec_content_img'].append(img_vector)
+
+    # Here, instead of saving the tensor, we simply print it as an example
+    print(tensor_description)
+
+    return tensor_description
+
+
+def vectorize_this_pdf_with_metadata_old(pdf_path, output_path, metadata_filename="DS_U3/U3_Metadaten.csv"):
+    # get PDF content, split into chunks
+
+    tensor_description = {  # sample
+        "my_id": 89,  # count how often the function has been called
+        "og_name": pdf_path,
+        "metadata": {"a": 1, "b": 2, "c": 3},  # from metadata_filename: the full row whose filename in column A matches
+        "vec_content_text": [[0.03874, 0.03947, -0.0875], [-0.03234, 0.03437, -0.011234]],  # vectorize all text chunks of the PDF
+        # call my_1_openai.vectorize_data(data="string"); it returns the Ada-002 vector as a list
+        "vec_content_img": [[0.01234, 0.09875, -0.0542], [-0.02456, 0.03537, -0.016634]]
+        # for the images, turn every PDF page into an image using pdf2image,
+        # then call my_1_openai.vectorize_image(data=PIL_OBJ); this function should return a vector
+        # of the image that is comparable to the text vectors. write this function as well.
+    }
+
+    tensor = []  # make tensor from tensor_description
+    return tensor
+
+
+def vectorize_pdfs(pdf_dict):
+    """
+    Vectorize a pdf using openai API
+
+    Parameters:
+    - dataset: dictionary containing PDF files.
+
+    Returns:
+    - dictionary containing vectors
+    """
+    vec_dataset = {}
+    for key in pdf_dict.keys():
+        try:
+            vector = my_1_openai.vectorize_data(pdf_dict[key])
+        except openai.BadRequestError as err:
+            print(err)
+            vector = [0, 0, 0]
+        vec_dataset[key] = str(vector)
+    return vec_dataset
+
+
+def vectorize_csv(csv_table, safe=False):
+    folder_name = ""
+    if safe:
+        folder_name = f"{csv_table}_vectorised/"
+        if not os.path.exists(folder_name):
+            os.makedirs(folder_name)
+    nb = 1
+    vec_dataset = []
+    for data_item in csv_table:
+        vector = my_1_openai.vectorize_data(data_item)
+        if safe:
+            # append so earlier vectors are not overwritten
+            with open(f"{folder_name}{csv_table}_vec.txt", "a") as f:
+                f.write(str(vector) + "\n")
+            print("csv_line" + str(nb))
+            nb += 1
+        vec_dataset.append(str(vector))
+    return vec_dataset
+
+
+def create_df(ds):
+    # my_df = {"name": [], "metadata": [], "body_text": []}
+    my_df, my_new_df = {}, {}
+    my_df["name"] = [filename for filename in os.listdir(ds) if filename.endswith('.pdf')]
+    my_df["metadata"] = my_1_reader.read_csv_lines_as_strings(ds + "_metadata.csv")[1:11]
+    my_df["text"] = list(my_1_reader.read_pdfs_from_folder(ds).values())
+    for e in my_df:
+        my_new_df[f"{e}_vec"] = [my_1_openai.vectorize_data(item) for item in my_df[e]]
+    for e in my_new_df:
+        my_df[str(e)] = my_new_df[e]
+    for e in my_df:
+        print(f"{e} {my_df[e][2]}")
+
+
+def create_vec_dataset(folder):
+    my_pdfs = my_1_reader.read_pdfs_from_folder(f"{folder}/PDF")
+    vectorize_then_safe_data(f"{folder}/vectors/names.json", my_pdfs.keys())
+    vectorize_then_safe_data(f"{folder}/vectors/texts.json", my_pdfs.values())
+
+
+# function to vectorize data=[], then save as json.
+def vectorize_then_safe_data(file_name, data):
+    my_vec_words = []
+    for entry in data:
+        my_vec_words.append(my_1_openai.vectorize_data(entry))
+    my_dict = dict(zip(data, my_vec_words))
+    my_1_writer.safe_my_dict_as_json(file_name, my_dict)
+    print("vectorised data saved")
+
+
+def main():
+    # Example call to the function
+    pdf_path = 'DS_U3/Dokumente/E - Elektroanlagen/ISB-020-U3-W-E-01-B07005-001-040.pdf'
+    output_path = 'DS_U3/Dokumente_vec'
+    metadata_filename = 'DS_U3/U3_Metadaten.csv'
+    vectorize_this_pdf_with_metadata(pdf_path, output_path, metadata_filename)
+
+
+if __name__ == "__main__":
+    print("this file contains embedding functions")
+    vec1 = vectorize_data("this is the test string")
+    vec2 = vectorize_data("this is the test string")
+    if vec1 == vec2:
+        print("same")
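
Note that the module-level vectorize_data and vectorize_image above are random-vector placeholders. A hypothetical way to wire the text path to the real Ada-002 helper before running the pipeline (paths taken from main() and assumed to exist locally):

    import my_1_openai
    import my_2_embedder

    # swap the random placeholder for the real embedding call
    my_2_embedder.vectorize_data = my_1_openai.vectorize_data

    tensor = my_2_embedder.vectorize_this_pdf_with_metadata(
        pdf_path="DS_U3/Dokumente/E - Elektroanlagen/ISB-020-U3-W-E-01-B07005-001-040.pdf",
        output_path="DS_U3/Dokumente_vec",
        metadata_filename="DS_U3/U3_Metadaten.csv",
    )
    print(len(tensor["vec_content_text"]), len(tensor["vec_content_img"]))
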
my_2_sim_search.py
ADDED
@@ -0,0 +1,109 @@
+import my_1_openai
+import my_1_writer
+import json
+import numpy as np
+
+
+# sim search with dot_product and lin_distance
+# the newly vectorized TERM will be added to the database
+def sim_search(database, term, add_to_db=True, debug=False):
+    if type(term) == str:
+        print("str")
+        vector1 = my_1_openai.vectorize_data(term)
+    elif type(term) == list:
+        print("list")
+        vector1 = term
+    else:
+        print("invalid search_term/search_vector format")
+        return
+    with open(database, "r") as f:
+        table = json.load(f)
+    sim_search_dict = {}
+    for key in table.keys():
+        vector2 = table[key]
+        if debug:
+            print("")
+            print(f"{vector1}")
+            print(f"{vector2}")
+            print(f"doing dot product for {key} and {term}")
+        dp = np.dot(vector1, vector2)
+        distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
+        if debug:
+            print(f"the dp is {dp}")
+            print(f"the distance is {distance}")
+            print("")
+            print("")
+            print("")
+        sim_search_dict[key] = dp * distance
+
+    # sort with the biggest similarity first
+    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))
+
+    if debug:
+        for key, value in list(sorted_table.items())[:5]:
+            print(f"{key}: {value}")
+    if add_to_db:
+
+        if term in table.keys():
+            print("the search term is in the database!")
+        # add the newly vectorized term to the words, if not already in the vector table
+        else:
+            if database != "session/my_words_vec_table.json":
+                database = "session/my_vecs.json"
+            # table = load_df(database)  # ??
+            table[str(term)] = vector1
+            my_1_writer.safe_my_dict_as_json(database, table)
+    # first_key, first_value = list(sorted_table.items())[0]
+    print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
+    return sorted_table
+
+
+def dot_p_to_1(database, vector1=0, analysis_filename=0):
+
+    with open(database, "r") as f:
+        table = json.load(f)
+    dot_product_to1 = {}
+
+    if vector1 == 0:
+        vector1 = [0.025515518153991442 for _ in range(1536)]
+    elif vector1 == 1:
+        vector1 = table[str(list(table.keys())[0])]
+
+    for key in table.keys():
+        # cast to float so the result stays JSON-serializable
+        dot_product_to1[key] = float(np.dot(vector1, table[key]))
+    my_1_writer.safe_my_dict_as_json(analysis_filename, dot_product_to1)
+    print("dot p to 1 saved")
+
+
+def lin_dist(database, vector1=0, analysis_filename=0):
+    with open(database, "r") as f:
+        table = json.load(f)
+    lin_dist_to_1 = {}
+
+    if vector1 == 0:
+        vector1 = [0.025515518153991442 for _ in range(1536)]
+    elif vector1 == 1:
+        vector1 = table[str(list(table.keys())[0])]
+
+    for key in table.keys():
+        lin_dist_to_1[key] = float(np.linalg.norm(np.array(vector1) - np.array(table[key])))
+
+    my_1_writer.safe_my_dict_as_json(analysis_filename, lin_dist_to_1)
+    print("lin dist to 1 saved")
+
+
+def manhattan_dist(database, vector1=0, analysis_filename=0):
+    with open(database, "r") as f:
+        table = json.load(f)
+    manhattan_dist_to_1 = {}
+
+    if vector1 == 0:
+        vector1 = [0.025515518153991442 for _ in range(1536)]
+    elif vector1 == 1:
+        vector1 = table[str(list(table.keys())[0])]
+
+    for key in table.keys():
+        # Manhattan distance is the sum of absolute coordinate differences
+        manhattan_dist_to_1[key] = float(np.sum(np.abs(np.array(vector1) - np.array(table[key]))))
+
+    my_1_writer.safe_my_dict_as_json(analysis_filename, manhattan_dist_to_1)
+    print("manhattan dist to 1 saved")
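
The ranking in sim_search scores each entry with the dot product multiplied by the Euclidean distance. For Ada-002 embeddings, cosine similarity is the more usual score; a sketch of what that swap could look like (not part of the commit):

    import numpy as np

    def cosine_similarity(v1, v2):
        """Cosine similarity of two embedding vectors; 1.0 means identical direction."""
        a, b = np.array(v1, dtype=float), np.array(v2, dtype=float)
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

    # inside sim_search, the scoring line would then read:
    # sim_search_dict[key] = cosine_similarity(vector1, vector2)
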