File size: 5,174 Bytes
f66e562
70a0bfa
 
 
 
 
 
 
35a2d79
70a0bfa
 
 
 
 
 
 
15511a5
70a0bfa
f66e562
70a0bfa
 
 
 
f66e562
70a0bfa
 
 
 
 
 
 
 
f66e562
70a0bfa
 
 
 
 
 
 
57e49c3
70a0bfa
 
 
 
 
 
 
 
 
 
 
 
 
f66e562
70a0bfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f66e562
70a0bfa
 
 
 
f66e562
70a0bfa
f66e562
 
70a0bfa
f66e562
70a0bfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2276fb1
4568aa7
 
70a0bfa
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# Import all necessary libraries and don't forget to check out Dependencies
import streamlit as st
from PIL import Image
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import pandas as pd
import pyperclip
import random
import easyocr
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel 

# Load the pretrained captioning model together with its matching image
# preprocessor and text tokenizer. All three come from the same checkpoint,
# so the identifier is spelled out once and reused.
_CAPTION_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"
model = VisionEncoderDecoderModel.from_pretrained(_CAPTION_CHECKPOINT)
feature_extractor = ViTFeatureExtractor.from_pretrained(_CAPTION_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(_CAPTION_CHECKPOINT)

# Function to generate captions
def generate_captions(image):
    """Return a text caption for a picture using the module-level captioning model.

    Args:
        image: Anything ``PIL.Image.open`` accepts (a path or a file-like
            object); it is opened here and converted to RGB.

    Returns:
        The decoded caption string with every ``"<|endoftext|>"`` marker
        removed.
    """
    rgb_image = Image.open(image).convert("RGB")

    # Preprocess the picture into model-ready pixel tensors on the CPU.
    encoded = feature_extractor(rgb_image, return_tensors="pt")
    pixel_values = encoded.pixel_values.to("cpu")

    # Only the first generated sequence is decoded.
    output_ids = model.generate(pixel_values)
    decoded = tokenizer.decode(output_ids[0])

    # The decoded text still carries the end-of-text marker; drop it.
    return decoded.replace("<|endoftext|>", "")

# Function that uses easyocr to extract text from the image
def image_text(image):
    """Detect English text in an image with EasyOCR and return it as hashtags.

    Args:
        image: An image object that ``numpy.array`` can convert (the caller
            passes a PIL image).

    Returns:
        A list with one hashtag per OCR detection: the detected string is
        stripped, lower-cased, has its spaces removed, and is prefixed with
        ``"#"``. Detections that are empty after this clean-up are skipped so
        the result never contains a bare ``"#"``.
    """
    img_np = np.array(image)
    reader = easyocr.Reader(['en'])
    detections = reader.readtext(img_np)

    hashtags = []
    for detection in detections:
        # Index 1 of each detection holds the recognised string.
        tag_body = detection[1].strip().lower().replace(" ", "")
        if tag_body:
            hashtags.append('#' + tag_body)
    return hashtags

# English stopwords from NLTK, kept in a set so the keyword filters below
# can do fast membership tests.
stop_words = {*stopwords.words('english')}

# Add hashtags to the keywords generated from image captioning
def add_hashtags(keywords):
    """Turn keywords into hashtags.

    Args:
        keywords: Iterable of keyword strings.

    Returns:
        A list, in input order, of each keyword lower-cased and prefixed
        with ``"#"``.
    """
    return ['#' + word.lower() for word in keywords]

# function to get and add trending Hashtags
def trending_hashtags(caption, hashtags_file="hashies.txt", top_k=5):
    """Pick the trending hashtags most similar to a caption.

    The hashtag list is read from a comma-separated text file. Each hashtag
    and the caption are reduced to lower-cased, stopword-free tokens, and the
    hashtags are ranked by TF-IDF cosine similarity to the caption.

    Args:
        caption: Caption text to compare the hashtags against.
        hashtags_file: Path of the comma-separated hashtag file.
        top_k: Maximum number of hashtags to return.

    Returns:
        Up to ``top_k`` distinct hashtags, most similar first, with any
        surrounding single quotes stripped.

    Raises:
        OSError: If ``hashtags_file`` cannot be opened.
    """
    with open(hashtags_file, "r", encoding="utf-8") as file:
        hashtags_string = file.read()

    # Split on commas, trim whitespace, and drop empty entries (e.g. from a
    # trailing comma). dict.fromkeys removes duplicates but keeps file order.
    candidates = list(dict.fromkeys(
        entry.strip() for entry in hashtags_string.split(',') if entry.strip()
    ))
    if not candidates:
        return []

    def extract_keywords(text):
        """Tokenize text and keep the lower-cased non-stopword tokens."""
        return [
            token.lower()
            for token in word_tokenize(text)
            if token.lower() not in stop_words
        ]

    def calculate_similarity(text1, text2):
        """Return the TF-IDF cosine similarity of two strings (0.0 if neither has usable terms)."""
        try:
            tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
        except ValueError:
            # The vectorizer refuses to fit when no document yields a token
            # (empty vocabulary); treat that as "nothing in common".
            return 0.0
        return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

    # The caption side of every comparison is the same, so build it once.
    caption_text = ' '.join(extract_keywords(caption))
    scored = [
        (calculate_similarity(caption_text, ' '.join(extract_keywords(tag))), tag)
        for tag in candidates
    ]

    # Highest similarity first; ties fall back to the hashtag text.
    scored.sort(reverse=True)

    # De-duplicate BEFORE truncating so duplicates do not use up top-k slots,
    # and keep the ranking order in the returned list.
    ranked = dict.fromkeys(tag.strip("'") for _, tag in scored)
    return list(ranked)[:top_k]

# Streamlit app Creation
def app():
    """Render the Streamlit page that suggests hashtags for an uploaded picture.

    Once a picture is uploaded, hashtags are gathered from three sources:
    keywords of the generated caption, text detected inside the picture, and
    trending hashtags similar to the caption. The merged list is
    de-duplicated, entries ending in a digit are dropped, and a random
    selection of at most 15 is shown below the picture.
    """
    st.title('Have a :green[Bueatiful pic!] Looking for :orange[Trending Hashtags to post it on your social handle?]. Here is some Help')

    # create file uploader
    uploaded_file = st.file_uploader("Upload Picture of your wish!, :violet[magic on the Way! ]", type=["jpg", "jpeg", "png"])

    # Nothing more to render until a file has been uploaded.
    if uploaded_file is None:
        return

    # load the image
    image = Image.open(uploaded_file).convert("RGB")

    # Hashtags from the generated caption (stopwords removed)
    caption = generate_captions(uploaded_file)
    keywords = [
        token.lower()
        for token in word_tokenize(caption)
        if token.lower() not in stop_words
    ]
    hashtags = add_hashtags(keywords)

    # Hashtags from text detected inside the image
    extracted_text = image_text(image)

    # Trending hashtags related to the caption
    web_hashtags = trending_hashtags(caption)

    # De-duplicate and drop entries ending in a digit BEFORE truncating, so
    # duplicates and rejected entries do not use up any of the 15 slots.
    candidates = [
        tag
        for tag in dict.fromkeys(hashtags + extracted_text + web_hashtags)
        if not re.search(r'\d$', tag)
    ]

    # Shuffle so the selection mixes all three sources.
    random.shuffle(candidates)
    combined_hashtags = candidates[:15]

    # display the image
    st.image(image, caption='The Uploaded File')
    hashtag_text = "\n ".join(combined_hashtags)
    st.write("Magical hashies have arrived* :sparkles: ")
    st.write(hashtag_text)

# Run the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    app()