"""Gradio demo: extract product features (Brand, Model, RAM, ...) from a
description by combining DuckDuckGo web search (through rotating SOCKS4
proxies) with a Mistral-7B-Instruct prompt that returns JSON."""

import itertools
import time

import gradio as gr
import requests
from duckduckgo_search import DDGS
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_huggingface import HuggingFaceEndpoint
from langdetect import detect

# Public SOCKS4 proxy list, one bare "host:port" per line.
PROXY_LIST_URL = "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt"


def get_proxies():
    """Fetch the SOCKS4 proxy list from GitHub.

    Returns:
        list[str]: non-empty "host:port" entries, one per proxy.

    Raises:
        requests.HTTPError: if the proxy list cannot be downloaded.
    """
    response = requests.get(PROXY_LIST_URL, timeout=30)
    response.raise_for_status()  # fail loudly instead of cycling an error page
    return [line for line in response.text.splitlines() if line.strip()]


# Module-level rotation state. NOTE: this performs a network call at import
# time — kept for backward compatibility with the original script layout.
proxy_list = get_proxies()
proxy_cycle = itertools.cycle(proxy_list)


class ProxiedDDGS(DDGS):
    """DDGS subclass that routes its HTTP GETs through a single proxy."""

    def __init__(self, proxy):
        super().__init__()
        # The upstream list contains bare "host:port" SOCKS4 entries; requests
        # needs an explicit scheme or it would treat them as HTTP proxies.
        # (Requires requests[socks] / PySocks to be installed.)
        self.proxy = proxy if "://" in proxy else f"socks4://{proxy}"

    def _get(self, url, headers=None):
        """GET *url* through the configured proxy; raises on HTTP errors."""
        response = requests.get(
            url,
            headers=headers,
            proxies={"http": self.proxy, "https": self.proxy},
            timeout=15,  # never hang the retry loop on a dead proxy
        )
        response.raise_for_status()
        return response


def search_with_retries(query, max_results=3, max_retries=5, backoff_factor=1):
    """Run a DuckDuckGo text search, rotating proxies with linear backoff.

    Args:
        query: search string.
        max_results: maximum number of results to request.
        max_retries: attempts before giving up.
        backoff_factor: seconds multiplied by the attempt number between tries.

    Returns:
        tuple: (results, proxy) — the result list and the proxy that worked.

    Raises:
        RuntimeError: when every attempt fails.
    """
    for attempt in range(1, max_retries + 1):
        proxy = next(proxy_cycle)
        try:
            searcher = ProxiedDDGS(proxy)
            return searcher.text(query, max_results=max_results), proxy
        except Exception:
            # Public proxies are flaky; back off and rotate to the next one.
            time.sleep(backoff_factor * attempt)
    raise RuntimeError(f"All retries failed for query: {query}")


# Deterministic decoding; with do_sample=False the temperature is effectively
# ignored by the endpoint.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    max_new_tokens=128,
    temperature=0.7,
    do_sample=False,
)

# Prompt template for feature extraction. Double braces escape the literal
# JSON example from PromptTemplate's {placeholder} substitution.
template_extract_features = ''' You are a product feature extractor bot. Your task is to determine features like Brand, Model, Type, RAM, Storage, etc., from the given product description and web search results. Return features in JSON format with keys like Brand, Model, Type, RAM, Storage, and others. Your response MUST only include a valid JSON object and nothing else. 
Example: {{ "Brand": "Apple", "Model": "iPhone 14", "Type": "Smartphone", "RAM": "4GB", "Storage": "128GB" }} Answer with JSON for the following: Given product description and web search results: {TEXT} {SEARCH_RESULTS} '''

json_output_parser = JsonOutputParser()


def extract_features(description):
    """Detect the description's language and extract product features via LLM.

    Args:
        description: free-text product description.

    Returns:
        tuple: (language_code, parsed_features_dict, elapsed_seconds).
    """
    start = time.time()

    # langdetect raises on empty/undetectable input; default to English.
    try:
        lang = detect(description)
    except Exception:
        lang = "en"

    # Web search for extra context. DDGS.text() results carry the snippet
    # under the "body" key (the original code read a nonexistent "snippet"
    # key and always produced empty context).
    try:
        search_results, _ = search_with_retries(description, max_results=3)
        search_text = "\n".join(
            res.get("body") or res.get("snippet") or "" for res in search_results
        )
    except RuntimeError:
        search_text = "No search results available."

    prompt_extract = PromptTemplate(
        template=template_extract_features,
        input_variables=["TEXT", "SEARCH_RESULTS"],
    )
    formatted_prompt = prompt_extract.format(
        TEXT=description, SEARCH_RESULTS=search_text
    )

    response = llm.invoke(formatted_prompt)
    parsed_output = json_output_parser.parse(response)

    return lang, parsed_output, time.time() - start


def create_gradio_interface():
    """Build and launch the Gradio UI wrapping extract_features()."""
    with gr.Blocks() as iface:
        text_input = gr.Textbox(label="Item Description")
        lang_output = gr.Textbox(label="Detected Language")
        feature_output = gr.Textbox(label="Extracted Features (JSON)")
        time_taken = gr.Textbox(label="Time Taken (seconds)")
        submit_btn = gr.Button("Extract Features")

        def on_submit(text):
            lang, features, duration = extract_features(text)
            return lang, features, f"{duration:.2f} seconds"

        submit_btn.click(
            fn=on_submit,
            inputs=text_input,
            outputs=[lang_output, feature_output, time_taken],
        )

    iface.launch()


if __name__ == "__main__":
    create_gradio_interface()