# Hugging Face Space app: LLM-based product feature extraction with proxied DuckDuckGo web search.
import itertools
import logging
import time

import gradio as gr
import requests
from duckduckgo_search import DDGS
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_huggingface import HuggingFaceEndpoint
from langdetect import detect
# Fetch a SOCKS4 proxy list from the public TheSpeedX/PROXY-List GitHub mirror.
def get_proxies():
    """Download the SOCKS4 proxy list and return it as a list of 'host:port' strings.

    Returns:
        list[str]: one proxy address per entry, blank lines removed.

    Raises:
        requests.RequestException: if the download fails or returns an HTTP error.
    """
    url = "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt"
    # Timeout prevents the whole app from hanging at import time if GitHub is slow.
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # fail loudly instead of silently parsing an error page
    # Drop blank lines that would otherwise become empty (unusable) proxy entries.
    return [line.strip() for line in response.text.splitlines() if line.strip()]
# Build a round-robin proxy rotation at import time.
# NOTE(review): this performs a network request on module import; previously a
# fetch failure crashed the import, so fall back to a direct connection instead.
try:
    proxy_list = get_proxies()
except Exception:
    proxy_list = []
# itertools.cycle([]) raises StopIteration on the first next(); use [None] so
# requests falls back to an unproxied (direct) connection when no proxies exist.
proxy_cycle = itertools.cycle(proxy_list or [None])
# DDGS subclass that routes its HTTP GETs through a single fixed proxy.
class ProxiedDDGS(DDGS):
    """DuckDuckGo search client whose requests are sent via one proxy."""

    def __init__(self, proxy):
        super().__init__()
        # Proxy address string (or None for a direct connection).
        self.proxy = proxy

    def _get(self, url, headers=None):
        """GET *url* through self.proxy and return the response.

        Raises:
            requests.RequestException: on network failure or non-2xx status.
        """
        # Timeout keeps a dead proxy from hanging the caller's retry loop forever.
        response = requests.get(
            url,
            headers=headers,
            proxies={"http": self.proxy, "https": self.proxy},
            timeout=10,
        )
        response.raise_for_status()
        return response
# Search DuckDuckGo, rotating to a fresh proxy after every failed attempt.
def search_with_retries(query, max_results=3, max_retries=5, backoff_factor=1):
    """Run a DuckDuckGo text search with proxy rotation and linear backoff.

    Args:
        query: search string.
        max_results: maximum number of results to request.
        max_retries: number of attempts before giving up.
        backoff_factor: seconds, multiplied by the attempt number, slept between tries.

    Returns:
        tuple: (results, proxy) — the search hits and the proxy that succeeded.

    Raises:
        RuntimeError: when every attempt failed; chained to the last underlying error.
    """
    logger = logging.getLogger(__name__)
    last_error = None
    for attempt in range(1, max_retries + 1):
        proxy = next(proxy_cycle)
        try:
            searcher = ProxiedDDGS(proxy)
            return searcher.text(query, max_results=max_results), proxy
        except Exception as exc:  # any proxy/network error just triggers rotation
            last_error = exc
            # Log instead of silently swallowing, so dead proxies are diagnosable.
            logger.warning(
                "Search attempt %d/%d via %s failed: %s",
                attempt, max_retries, proxy, exc,
            )
            time.sleep(backoff_factor * attempt)
    # Chain the last exception so the real cause is visible in tracebacks.
    raise RuntimeError(f"All retries failed for query: {query}") from last_error
# Initialize the LLM used for feature extraction (remote HF Inference endpoint).
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    max_new_tokens=128,  # answers are short JSON objects only
    temperature=0.7,  # NOTE(review): temperature is ignored when do_sample=False — confirm intent
    do_sample=False,  # greedy decoding for more reproducible JSON output
)
# Prompt template for feature extraction.
# {TEXT} and {SEARCH_RESULTS} are filled in by PromptTemplate.format() at call time;
# the doubled braces render a literal JSON example in the prompt.
template_extract_features = '''
You are a product feature extractor bot. Your task is to determine features like Brand, Model, Type, RAM, Storage, etc., from the given product description and web search results.
Given product description: {TEXT}
Relevant web search results:
{SEARCH_RESULTS}
Return features in JSON format with keys like Brand, Model, Type, RAM, Storage, and others.
Your response MUST only include a valid JSON object and nothing else.
Example:
{{
"Brand": "Apple",
"Model": "iPhone 14",
"Type": "Smartphone",
"RAM": "4GB",
"Storage": "128GB"
}}
'''
# Parses the model's raw text reply into a Python dict (raises if it is not valid JSON).
json_output_parser = JsonOutputParser()
# Detect the language, gather web context, and ask the LLM for product features.
def extract_features(description):
    """Extract structured product features from a free-text description.

    Args:
        description: product description text in any language.

    Returns:
        tuple: (lang, features, seconds) — detected language code, the parsed
        JSON feature dict, and the elapsed wall-clock time.
    """
    start = time.time()
    # Language detection fails on very short/ambiguous text; default to English.
    try:
        lang = detect(description)
    except Exception:
        lang = "en"
    # Gather web search snippets to ground the LLM's answer.
    try:
        search_results, _ = search_with_retries(description, max_results=3)
        # duckduckgo_search keys snippets as 'body'; keep 'snippet' as a
        # fallback for older library versions — TODO confirm installed version.
        search_text = "\n".join(
            res.get("body") or res.get("snippet", "") for res in search_results
        )
    except RuntimeError:
        search_text = "No search results available."
    # Format the prompt and run the model.
    prompt_extract = PromptTemplate(
        template=template_extract_features,
        input_variables=["TEXT", "SEARCH_RESULTS"],
    )
    formatted_prompt = prompt_extract.format(TEXT=description, SEARCH_RESULTS=search_text)
    response = llm.invoke(formatted_prompt)
    parsed_output = json_output_parser.parse(response)
    return lang, parsed_output, time.time() - start
# Build and launch the Gradio UI for the feature extractor.
def create_gradio_interface():
    """Assemble the Blocks UI, wire the submit handler, and launch the app."""
    with gr.Blocks() as iface:
        description_box = gr.Textbox(label="Item Description")
        language_box = gr.Textbox(label="Detected Language")
        features_box = gr.Textbox(label="Extracted Features (JSON)")
        elapsed_box = gr.Textbox(label="Time Taken (seconds)")
        extract_button = gr.Button("Extract Features")

        def handle_submit(description):
            # Delegate to the extraction pipeline and format the timing for display.
            language, features, elapsed = extract_features(description)
            return language, features, f"{elapsed:.2f} seconds"

        extract_button.click(
            fn=handle_submit,
            inputs=description_box,
            outputs=[language_box, features_box, elapsed_box],
        )
        iface.launch()


if __name__ == "__main__":
    create_gradio_interface()