File size: 6,736 Bytes
2bcc178
 
 
 
 
 
 
 
 
 
2e95501
2bcc178
 
 
 
964fedc
2bcc178
e3fbad5
2bcc178
 
 
 
 
39acce6
2bcc178
 
 
 
39acce6
2bcc178
 
39acce6
2bcc178
39acce6
2bcc178
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e3fbad5
2bcc178
964fedc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2bcc178
 
 
 
 
 
 
 
 
 
 
 
 
94c0e24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2bcc178
81a28c0
 
 
94c0e24
81a28c0
f89ca16
 
81a28c0
7ebe974
94c0e24
81a28c0
 
94c0e24
81a28c0
2bcc178
94c0e24
2bcc178
 
 
4e10dac
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
from huggingface_hub import InferenceClient
import nltk
import re 
import requests
import os 

# Hugging Face token is read from the environment; if HF_KEY is unset the
# InferenceClient below is constructed with api_key=None and calls will fail
# at request time with an auth error.
api_key = os.getenv("HF_KEY")
# Fetch the NLTK data used by extract_product_info at import time:
# sentence/word tokenizer models ('punkt', 'punkt_tab') and the POS taggers.
# nltk.download() is a no-op when the data is already cached locally.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')


client = InferenceClient(api_key=api_key)

# NOTE(review): removed a commented-out, superseded draft of
# extract_product_info() that duplicated the active implementation below
# without its error handling. The live definition that follows is the sole
# source of truth; version history belongs in source control, not the file.
def extract_product_info(text):
    """Parse an LLM answer into brand / model / description / price fields.

    Heuristics:
      * price: first "$..." amount found by regex (digits, optional comma
        thousands separators, optional cents); stored without '$' or ','.
      * brand: the first proper-noun-like token after the price is removed.
      * model: every subsequent alphanumeric-looking token.
      * description: any leftover tokens.

    Never raises: tokenization / POS-tagging failures fall back to plain
    ``str.split`` and dummy tags, and any other error returns the raw text
    as the description.

    Args:
        text: free-form product text (typically raw LLM output).

    Returns:
        dict with keys "brand", "model", "description", "price"; each value
        is a string, or None when that field could not be extracted.
    """
    print(f"Extract function called with input: {text}")

    # Initialize result dictionary
    result = {"brand": None, "model": None, "description": None, "price": None}

    try:
        # BUG FIX: use \d+ instead of \d{1,3} before the separator group so
        # prices written without thousands separators (e.g. "$1500") are not
        # truncated to their first three digits ("$150").
        price_match = re.search(r'\$\s?\d+(?:,\d{3})*(?:\.\d{2})?', text)
        print(f"Price match: {price_match}")
        if price_match:
            result["price"] = price_match.group().replace("$", "").replace(",", "").strip()
            # Strip the matched price so it cannot leak into brand/model.
            text = text.replace(price_match.group(), "").strip()
        print(f"Text after removing price: {text}")

        # Tokenize the remaining text; fall back to whitespace splitting if
        # NLTK (or its 'punkt' data) is unavailable.
        try:
            tokens = nltk.word_tokenize(text)
            print(f"Tokens: {tokens}")
        except Exception as e:
            print(f"Error during tokenization: {e}")
            tokens = text.split()
            print(f"Fallback tokens: {tokens}")

        # POS tagging; fall back to dummy noun tags so the extraction loop
        # below still runs on the alphanumeric-pattern branch.
        try:
            pos_tags = nltk.pos_tag(tokens)
            print(f"POS Tags: {pos_tags}")
        except Exception as e:
            print(f"Error during POS tagging: {e}")
            pos_tags = [(word, "NN") for word in tokens]
            print(f"Fallback POS Tags: {pos_tags}")

        # Partition tokens: first match -> brand, later matches -> model,
        # everything else -> description.
        brand_parts = []
        model_parts = []
        description_parts = []

        for word, tag in pos_tags:
            if tag == 'NNP' or re.match(r'[A-Za-z0-9-]+', word):
                if len(brand_parts) == 0:  # Assume the first proper noun is the brand
                    brand_parts.append(word)
                else:  # Model number tends to follow the brand
                    model_parts.append(word)
            else:
                description_parts.append(word)

        # Assign values to the result dictionary
        if brand_parts:
            result["brand"] = " ".join(brand_parts)
        if model_parts:
            result["model"] = " ".join(model_parts)
        if description_parts:
            result["description"] = " ".join(description_parts)

        print(f"Extract function returned: {result}")

    except Exception as e:
        print(f"Unexpected error: {e}")
        # Last-resort fallback: hand the caller the raw text as description.
        result["description"] = text
        print(f"Fallback result: {result}")

    return result


def extract_info(text):
    """Ask google/flan-t5-large (HF serverless API) to structure *text*.

    Sends one inference request asking for brand name, model number,
    description and average price, and returns the decoded JSON response.
    Network/HTTP errors propagate to the caller.

    Args:
        text: free-form product text to analyze.

    Returns:
        Parsed JSON body of the API response (model output on success, or
        an error payload from the API).
    """
    API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-large"
    headers = {"Authorization": f"Bearer {api_key}"}
    payload = {"inputs": f"From the given text, extract brand name, model number, description about it, and its average price in today's market. Give me back a python dictionary with keys as brand_name, model_number, desc, price. The text is {text}.",}
    # timeout so a stuck endpoint cannot hang the caller forever
    response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
    print('GOOGLEE LLM OUTPUTTTTTTT\n\n',response )
    output = response.json()
    print(output)
    # BUG FIX: the result was computed and printed but never returned,
    # so every caller received None.
    return output


def get_name(url, object):
    """Identify the pictured item with a vision LLM and parse its answer.

    Sends the image URL plus a guiding prompt to
    meta-llama/Llama-3.2-11B-Vision-Instruct, then runs
    extract_product_info() over the model's reply.

    Args:
        url: publicly reachable image URL.
        object: caller's guess at what the image shows (note: parameter
            name shadows the ``object`` builtin; kept for compatibility).

    Returns:
        dict from extract_product_info with brand/model/description/price.
    """
    prompt = f"Is this a {object}?. Can you guess what it is and give me the closest brand it resembles to? or a model number? And give me its average price in today's market in USD. In output, give me its normal name, model name, model number and price. separated by commas. No description is needed."
    chat = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": url}},
            ],
        }
    ]

    completion = client.chat.completions.create(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        messages=chat,
        max_tokens=500,
    )

    print(f'\n\nNow output of LLM:\n')
    llm_result = completion.choices[0].message['content']
    print(llm_result)

    print(f"Extracting from the output now, function calling")
    result = extract_product_info(llm_result)
    print(f'\n\nResult brand and price:{result}')
    print(f'\n\nThat is the output')

    return result


# url = "https://i.ibb.co/mNYvqDL/crop_39.jpg"
# object="fridge"

# get_name(url, object)