Upload 9 files
- utils/ImageAndTextEmbedding/index.py +40 -0
- utils/audioEmbedding/index.py +28 -0
- utils/imageEmbedding/index.py +17 -0
- utils/imageToText/index.py +24 -0
- utils/objectDetection/index.py +12 -0
- utils/sample.py +77 -0
- utils/sentanceEmbedding/index.py +32 -0
- utils/similarityScore.py +41 -0
- utils/videoEmbedding/index.py +43 -0
utils/ImageAndTextEmbedding/index.py
ADDED
@@ -0,0 +1,40 @@
from PIL import Image
import io
from transformers import AutoTokenizer, CLIPProcessor, CLIPModel
import torch

# Load CLIP model and processor
model_name = "openai/clip-vit-base-patch32"
loaded_model = CLIPModel.from_pretrained(model_name)
loaded_processor = CLIPProcessor.from_pretrained(model_name)

def getTextEmbedding(text):
    # Preprocess the text
    print("tear")
    inputs_text = loaded_processor(text=[text], return_tensors="pt", padding=True)
    print("here")
    # Forward pass through the model
    with torch.no_grad():
        # Get the text features
        text_features = loaded_model.get_text_features(input_ids=inputs_text.input_ids, attention_mask=inputs_text.attention_mask)
    print("bear")
    # Convert tensor to numpy array for better readability
    text_embedding = text_features.squeeze().numpy()
    print("done")
    return text_embedding

def getImageEmbedding(binary_image_data):
    # Load and preprocess the image
    image = Image.open(io.BytesIO(binary_image_data))
    inputs = loaded_processor(images=image, return_tensors="pt", padding=True)

    # Forward pass through the model
    with torch.no_grad():
        # Get the image features
        image_features = loaded_model.get_image_features(pixel_values=inputs.pixel_values)

    # Convert tensor to numpy array for better readability
    image_embedding = image_features.squeeze().numpy()

    return image_embedding
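
A minimal usage sketch for the two helpers above (not part of the commit; it assumes the repository root is on the Python path and that photo.jpg is a locally available image):

from utils.ImageAndTextEmbedding.index import getTextEmbedding, getImageEmbedding
from utils.similarityScore import cosine_similarity  # defined later in this upload

with open("photo.jpg", "rb") as f:  # hypothetical local image
    image_bytes = f.read()

text_vec = getTextEmbedding("a photo of a dog")  # numpy array in CLIP's shared embedding space
image_vec = getImageEmbedding(image_bytes)       # numpy array in the same space

# Text and image embeddings share one space, so their cosine similarity is meaningful
print(cosine_similarity(text_vec, image_vec))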
utils/audioEmbedding/index.py
ADDED
@@ -0,0 +1,28 @@
import tensorflow as tf
import numpy as np
import librosa
import pickle
import io

# Load the YAMNet model from the SavedModel format
yamnet_model = tf.saved_model.load('yamnet_saved_model')

# Function to extract embeddings from an audio file using YAMNet
def extract_audio_embeddings(audio_binary):
    # Load audio from binary data using librosa
    audio, sample_rate = librosa.load(io.BytesIO(audio_binary), sr=16000)  # YAMNet requires a sample rate of 16 kHz
    # Convert audio to a float32 tensor
    audio_tensor = tf.convert_to_tensor(audio, dtype=tf.float32)
    # Extract embeddings using the YAMNet model
    scores, embeddings, spectrogram = yamnet_model(audio_tensor)
    embeddings_list = embeddings.numpy().tolist()  # Convert embeddings to a list of lists
    return embeddings_list

# Example usage
if __name__ == "__main__":
    image_audio_path = "pictures/users/1a.mp3"
    # Read the audio file as binary data and extract its embeddings
    image_audio_embeddings = extract_audio_embeddings(open(image_audio_path, "rb").read())
    print("Embeddings for", image_audio_path)
    print(image_audio_embeddings)
    print("audio embedding model loaded successfully")
utils/imageEmbedding/index.py
ADDED
@@ -0,0 +1,17 @@
import pickle
from torchvision import transforms
from PIL import Image
import torch
import io
from utils.ImageAndTextEmbedding.index import getImageEmbedding

def get_image_embedding(image_bytes):
    print("coming 1")
    return getImageEmbedding(image_bytes)

# Example: Load image data from file and get its embedding
# image_data = open("pictures/users/2.jpg", "rb").read()
# embedding = get_image_embedding(image_data)
# print(embedding)

print("Image embedding model loaded successfully!")
utils/imageToText/index.py
ADDED
@@ -0,0 +1,24 @@
import pickle
import re
from PIL import Image
from transformers import pipeline
import io

def clean_text(text):
    clean_text = re.sub(r'<[^>]+>', '', text)
    clean_text = clean_text.strip()
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text

pipe = pipeline("image-to-text", model="jinhybr/OCR-Donut-CORD")

def extract_text(binary_image):
    image = Image.open(io.BytesIO(binary_image))
    result = pipe(image)
    text = result[0]['generated_text']
    cleaned_text = clean_text(text)
    return cleaned_text

# print(extract_text(open("pictures/users/2.jpg", "rb").read()))

print("OCR pipeline loaded successfully!")
utils/objectDetection/index.py
ADDED
@@ -0,0 +1,12 @@
from transformers import pipeline
from PIL import Image
from io import BytesIO

# Load the object detection pipeline
object_detection_pipeline = pipeline("object-detection", model="ciasimbaya/ObjectDetection")
def detect_objects(image_bytes):
    image = Image.open(BytesIO(image_bytes))
    result = object_detection_pipeline(image)
    return result

print("object detection model loaded successfully")
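
A short usage sketch for detect_objects (an assumption on the output shape: the Hugging Face object-detection pipeline returns a list of dicts with "label", "score" and "box" entries):

from utils.objectDetection.index import detect_objects

with open("photo.jpg", "rb") as f:  # hypothetical local image
    detections = detect_objects(f.read())

for det in detections:
    # Each detection is expected to carry a label, a confidence score and a bounding box
    print(det["label"], round(det["score"], 3), det["box"])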
utils/sample.py
ADDED
@@ -0,0 +1,77 @@
import requests

# Define the media URLs
image_url = "https://utfs.io/f/47589c6c-6ce0-4baf-b75d-b1ec5d4d9dda-213j1w.jpg"
audio_url = "https://utfs.io/f/b84a84a2-b68f-49c5-8b7c-d76d894f6d3a-c5qjj4.wav"
video_url = "https://utfs.io/f/ef6c037f-fa61-471a-8956-562bc2d62531-fzxs1i.mp4"
family_url = "https://i.pinimg.com/originals/b2/20/14/b22014ca275e94097386aab222469caf.jpg"


# Define the URLs of the API endpoints
extract_text_url = "http://127.0.0.1:5000/extractText"
extract_audio_text_url = "http://127.0.0.1:5000/extractAudioText"
get_image_embedding_url = "http://127.0.0.1:5000/getImageEmbedding"
get_text_embedding_url = "http://127.0.0.1:5000/getTextEmbedding"
get_text_description_embedding_url = "http://127.0.0.1:5000/getTextDescriptionEmbedding"
get_audio_embedding_url = "http://127.0.0.1:5000/getAudioEmbedding"
get_audio_extracted_text_url = "http://127.0.0.1:5000/getAudioExtractedText"
get_video_embedding_url = "http://127.0.0.1:5000/getVideoEmbedding"
get_object_detection_url = "http://127.0.0.1:5000/detectObjects"
get_similarity_score_url = "http://127.0.0.1:5000/getSimilarityScore"
get_face_locations_url = "http://127.0.0.1:5000/getFaceLocations"

# Make requests to each endpoint
try:
    results = []

    response_text = requests.post(extract_audio_text_url, json={"audio_url": audio_url})
    extracted_text = response_text.json()["transcription"]
    results.append({"length of text": len(extracted_text)})

    # # Request to extract text
    # response_text = requests.post(extract_text_url, json={"imageUrl": image_url})
    # extracted_text = response_text.json().get("extracted_text")
    # results.append({"length of text": len(extracted_text)})

    # # Request to get image embedding
    # response_image_embedding = requests.post(get_image_embedding_url, json={"imageUrl": image_url})
    # image_embedding = response_image_embedding.json().get("image_embedding")
    # results.append({"length of image_embedding": len(image_embedding)})

    # # Request to get text embedding
    # response_text_embedding = requests.post(get_text_embedding_url, json={"text": extracted_text})
    # text_embedding = response_text_embedding.json().get("text_embedding")
    # results.append({"length of text_embedding": len(text_embedding)})

    # # Request to get text description embedding
    # response_text_description_embedding = requests.post(get_text_description_embedding_url, json={"text": "a image of mobile phone"})
    # text_description_embedding = response_text_description_embedding.json().get("text_description_embedding")
    # results.append({"length of text_description_embedding": len(text_description_embedding)})

    # # Request to get audio embedding
    # response_audio_embedding = requests.post(get_audio_embedding_url, json={"audioUrl": audio_url})
    # audio_embedding = response_audio_embedding.json().get("audio_embedding")
    # results.append({"length of audio_embedding": len(audio_embedding)})

    # Request to get video embedding
    response_video_embedding = requests.post(get_video_embedding_url, json={"videoUrl": video_url})
    video_embedding = response_video_embedding.json().get("video_embedding")
    results.append({"length of video_embedding": len(video_embedding)})

    # # Request to get object detection
    # response_object_detection = requests.post(get_object_detection_url, json={"imageUrl": image_url})
    # object_detection = response_object_detection.json().get("object_detection_results")
    # results.append({"length of object_detection": len(object_detection)})

    # # Request to get similarity score
    # response_similarity_score = requests.post(get_similarity_score_url, json={"embedding1": text_description_embedding, "embedding2": image_embedding})
    # similarity_score = response_similarity_score.json().get("similarity_score")
    # results.append({"similarity_score": similarity_score})

    # # Request to get face locations
    # response_face_locations = requests.post(get_face_locations_url, json={"imageUrl": family_url})
    # face_locations = response_face_locations.json().get("face_locations")
    # results.append({"face_locations": face_locations})

    print(results)
except Exception as e:
    print("Error:", e)
utils/sentanceEmbedding/index.py
ADDED
@@ -0,0 +1,32 @@
import pickle
from utils.ImageAndTextEmbedding.index import getTextEmbedding

with open("word2vec_model.pkl", "rb") as f:
    textEmbedding_model = pickle.load(f)

def get_text_vector(example_text):
    # Tokenize the text into words
    words = example_text.lower().split()

    # Filter out words that are not in the vocabulary of the Word2Vec model
    words_in_vocab = [word for word in words if word in textEmbedding_model]

    # Calculate the average vector representation of the words
    if words_in_vocab:
        text_vector = sum(textEmbedding_model[word] for word in words_in_vocab) / len(words_in_vocab)
        return text_vector.tolist()
    else:
        return None

def get_text_discription_vector(text):
    return getTextEmbedding(text)

# Example usage:
# example_text = "This is an example sentence."
# text_vector = get_text_vector(example_text)
# if text_vector:
#     print("Vector representation of the example text:", text_vector)
# else:
#     print("None of the words in the example text are in the vocabulary of the Word2Vec model.")

print("Text embedding model loaded successfully!")
utils/similarityScore.py
ADDED
@@ -0,0 +1,41 @@
import numpy as np

def euclidean_similarity(embedding1, embedding2):
    embedding1 = np.array(embedding1)
    embedding2 = np.array(embedding2)
    euclidean_distance = np.linalg.norm(embedding1 - embedding2)
    # Convert distance to similarity score
    similarity_score = 1 / (1 + euclidean_distance)  # You can use other transformations as well
    return similarity_score

def cosine_similarity(embedding1, embedding2):
    dot_product = np.dot(embedding1, embedding2)
    norm1 = np.linalg.norm(embedding1)
    norm2 = np.linalg.norm(embedding2)
    cosine_similarity = dot_product / (norm1 * norm2)
    return cosine_similarity

def jaccard_similarity(embedding1, embedding2):
    intersection = len(set(embedding1).intersection(set(embedding2)))
    union = len(set(embedding1).union(set(embedding2)))
    return intersection / union

def hamming_similarity(embedding1, embedding2):
    distance = np.count_nonzero(np.array(embedding1) != np.array(embedding2))  # element-wise comparison, also works for plain lists
    similarity = 1 - distance / len(embedding1)
    return similarity

def get_all_similarities(embedding1, embedding2):
    euclidean = euclidean_similarity(embedding1, embedding2)
    cosine = cosine_similarity(embedding1, embedding2)
    jaccard = jaccard_similarity(embedding1, embedding2)
    hamming = hamming_similarity(embedding1, embedding2)
    return {"euclidean": euclidean, "cosine": cosine, "jaccard": jaccard, "hamming": hamming}

# Example usage:
# embedding1 = [1, 2, 3]
# embedding2 = [4, 5, 6]
# similarities = get_all_similarities(embedding1, embedding2)
# print(similarities)

print("Similarity score is working")
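
A brief usage sketch for get_all_similarities with hypothetical inputs; note that the Jaccard and Hamming scores treat the vectors as discrete values, so for continuous float embeddings the Euclidean and cosine scores are the informative ones.

from utils.similarityScore import get_all_similarities

emb_a = [0.12, 0.50, -0.33]  # hypothetical embeddings
emb_b = [0.10, 0.48, -0.30]

# Prints a dict with "euclidean", "cosine", "jaccard" and "hamming" scores
print(get_all_similarities(emb_a, emb_b))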
utils/videoEmbedding/index.py
ADDED
@@ -0,0 +1,43 @@
import cv2
import numpy as np
from utils.imageEmbedding.index import get_image_embedding
from utils.imageToText.index import extract_text

def get_video_embedding(video_url):
    try:
        cap = cv2.VideoCapture(video_url)
        fps = cap.get(cv2.CAP_PROP_FPS)
        interval = int(fps)  # Capture a frame every second

        frame_count = 0
        video_embeddings = []

        while cap.isOpened():
            ret, frame = cap.read()
            if ret:
                if frame_count % interval == 0:
                    # Convert frame to binary format
                    ret, buffer = cv2.imencode('.jpg', frame)
                    if not ret:
                        continue
                    # Convert frame binary data to bytes
                    frame_bytes = buffer.tobytes()
                    # Get image embedding for the frame
                    extracted_text = extract_text(frame_bytes)
                    image_embedding = get_image_embedding(frame_bytes)
                    image_embedding_list = image_embedding.tolist()
                    video_embeddings.append({"image_embedding": image_embedding_list, "extracted_text": extracted_text})
                frame_count += 1
            else:
                break

        cap.release()
        return video_embeddings

    except Exception as e:
        print(e)

# Example usage:
# video_url = "https://utfs.io/f/ef6c037f-fa61-471a-8956-562bc2d62531-fzxs1i.mp4"
# video_embeddings = get_video_embedding(video_url)
# print("Video Embeddings:", video_embeddings)
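
A sketch of how these modules compose (assumptions: repository root on the Python path, a reachable video URL, and a free-text query): each sampled frame from get_video_embedding carries a CLIP image embedding that can be scored against a CLIP text embedding from getTextEmbedding, mirroring the getTextDescriptionEmbedding / getSimilarityScore flow exercised in utils/sample.py.

from utils.videoEmbedding.index import get_video_embedding
from utils.ImageAndTextEmbedding.index import getTextEmbedding
from utils.similarityScore import cosine_similarity

video_url = "https://example.com/clip.mp4"  # hypothetical URL
frames = get_video_embedding(video_url)     # one dict per sampled frame
query = getTextEmbedding("a person riding a bicycle")  # hypothetical query text

# Rank the sampled frames by similarity between the query and each frame's image embedding
scored = sorted(
    ((cosine_similarity(query, frame["image_embedding"]), idx) for idx, frame in enumerate(frames)),
    reverse=True,
)
print(scored[:3])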