from huggingface_hub import InferenceClient
import nltk
import re
import requests
import os
api_key = os.getenv("HF_KEY")
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
client = InferenceClient(api_key=api_key)
# Earlier version of extract_product_info, kept for reference but disabled inside this string literal.
'''
def extract_product_info(text):
    print('Extract function called!')
    # Initialize result dictionary
    result = {"brand": None, "model": None, "description": None, "price": None}

    # Extract price separately using regex (to avoid confusion with brand name)
    price_match = re.search(r'\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text)
    print(f'price_match: {price_match}')
    if price_match:
        result["price"] = price_match.group().replace("$", "").replace(",", "").strip()
        # Remove the price part from the text to prevent it from being included in the brand/model extraction
        text = text.replace(price_match.group(), "").strip()
        print(f'text: {text}')

    # Tokenize the remaining text and tag parts of speech
    tokens = nltk.word_tokenize(text)
    print(f'tokens are: {tokens}')
    pos_tags = nltk.pos_tag(tokens)
    print(tokens, pos_tags)

    # Extract brand and model (proper nouns + alphanumeric patterns)
    brand_parts = []
    model_parts = []
    description_parts = []

    # First part: extract brand and model info
    for word, tag in pos_tags:
        if tag == 'NNP' or re.match(r'[A-Za-z0-9-]+', word):
            if len(brand_parts) == 0:  # Assume the first proper noun is the brand
                brand_parts.append(word)
            else:  # Model number tends to follow the brand
                model_parts.append(word)
        else:
            description_parts.append(word)

    # Assign brand and model to the result dictionary
    if brand_parts:
        result["brand"] = " ".join(brand_parts)
    if model_parts:
        result["model"] = " ".join(model_parts)
    # Combine the remaining parts as description
    result["description"] = " ".join(description_parts)

    print(f'extract function returned:\n{result}')
    return result
'''
def extract_product_info(text):
    print(f"Extract function called with input: {text}")
    # Initialize result dictionary
    result = {"brand": None, "model": None, "description": None, "price": None}

    try:
        # Extract price using regex
        price_match = re.search(r'\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text)
        print(f"Price match: {price_match}")
        if price_match:
            result["price"] = price_match.group().replace("$", "").replace(",", "").strip()
            # Remove the price part from the text to prevent interference
            text = text.replace(price_match.group(), "").strip()
            print(f"Text after removing price: {text}")

        # Tokenize the remaining text
        try:
            tokens = nltk.word_tokenize(text)
            print(f"Tokens: {tokens}")
        except Exception as e:
            print(f"Error during tokenization: {e}")
            # Fall back to a simple split if tokenization fails
            tokens = text.split()
            print(f"Fallback tokens: {tokens}")

        # POS tagging
        try:
            pos_tags = nltk.pos_tag(tokens)
            print(f"POS Tags: {pos_tags}")
        except Exception as e:
            print(f"Error during POS tagging: {e}")
            # If POS tagging fails, create dummy tags
            pos_tags = [(word, "NN") for word in tokens]
            print(f"Fallback POS Tags: {pos_tags}")

        # Extract brand, model, and description
        brand_parts = []
        model_parts = []
        description_parts = []
        for word, tag in pos_tags:
            if tag == 'NNP' or re.match(r'[A-Za-z0-9-]+', word):
                if len(brand_parts) == 0:  # Assume the first proper noun is the brand
                    brand_parts.append(word)
                else:  # Model number tends to follow the brand
                    model_parts.append(word)
            else:
                description_parts.append(word)

        # Assign values to the result dictionary
        if brand_parts:
            result["brand"] = " ".join(brand_parts)
        if model_parts:
            result["model"] = " ".join(model_parts)
        if description_parts:
            result["description"] = " ".join(description_parts)
        print(f"Extract function returned: {result}")
    except Exception as e:
        print(f"Unexpected error: {e}")
        # Return a fallback result in case of a critical error
        result["description"] = text
        print(f"Fallback result: {result}")
    return result
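
# Illustrative example (hypothetical input, not from the original source): a call like
#   extract_product_info("Samsung RF28 $1,299.99")
# is expected to return roughly
#   {"brand": "Samsung", "model": "RF28", "description": None, "price": "1299.99"}
# since the first matching token is treated as the brand, later alphanumeric tokens as the
# model, and the matched price has "$" and "," removed before being stored.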
def extract_info(text):
    API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-large"
    headers = {"Authorization": f"Bearer {api_key}"}
    payload = {
        "inputs": (
            "From the given text, extract the brand name, model number, a description of it, "
            "and its average price in today's market. Give me back a Python dictionary with the "
            f"keys brand_name, model_number, desc, and price. The text is: {text}."
        ),
    }
    response = requests.post(API_URL, headers=headers, json=payload)
    print('Google LLM output:\n\n', response)
    output = response.json()
    print(output)
    return output
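
# Note on extract_info (an assumption, not verified against the current API): for
# text2text-generation models on the serverless Inference API, response.json() is
# typically a list of the form [{"generated_text": "..."}], so a caller would read
# output[0]["generated_text"] before doing any further parsing.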
def get_name(url, object):
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": (
                        f"Is this a {object}? Can you guess what it is and give me the closest brand it "
                        "resembles, or a model number? Also give me its average price in today's market "
                        "in USD. In the output, give me its common name, model name, model number, and "
                        "price, separated by commas. No description is needed."
                    )
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": url
                    }
                }
            ]
        }
    ]

    completion = client.chat.completions.create(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        messages=messages,
        max_tokens=500
    )

    print('\n\nNow the output of the LLM:\n')
    llm_result = completion.choices[0].message.content
    print(llm_result)

    print("Extracting structured fields from the LLM output")
    result = extract_product_info(llm_result)
    print(f'\n\nResult brand and price: {result}')
    print('\n\nThat is the output')
    # result2 = extract_info(llm_result)
    # print(f'\n\nFrom Google LLM: {result2}')
    return result
# url = "https://i.ibb.co/mNYvqDL/crop_39.jpg"
# object="fridge"
# get_name(url, object)
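
# Minimal usage sketch (assumptions: HF_KEY is set in the environment, and the image URL
# below is a hypothetical placeholder, not a verified image):
if __name__ == "__main__":
    sample_url = "https://example.com/sample_fridge.jpg"  # hypothetical URL
    info = get_name(sample_url, "fridge")
    print(f"Final extracted info: {info}")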