"""Gradio app: extract structured product features (Brand, Model, RAM, ...)
from a free-text description, using a proxy-rotated DuckDuckGo search for
context and a Mistral-7B-Instruct endpoint on Hugging Face for extraction."""

import itertools
import time

import gradio as gr
import requests
from duckduckgo_search import DDGS
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_huggingface import HuggingFaceEndpoint
from langdetect import detect


def get_proxies():
    """Download a public SOCKS4 proxy list (one ``host:port`` per line).

    Returns:
        list[str]: proxy addresses.

    Raises:
        requests.RequestException: if the list cannot be downloaded.
    """
    url = "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt"
    # Timeout so a stalled download cannot hang forever (this runs at import).
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page as proxies.
    response.raise_for_status()
    return response.text.splitlines()


# NOTE(review): fetched at import time — importing this module performs
# network I/O and raises if GitHub is unreachable.
proxy_list = get_proxies()
proxy_cycle = itertools.cycle(proxy_list)


class ProxiedDDGS(DDGS):
    """DDGS subclass that routes its HTTP GETs through a single proxy.

    NOTE(review): this overrides a private ``_get`` hook; newer
    duckduckgo_search releases take a ``proxy=`` constructor argument
    instead — confirm the installed version still calls ``_get``.
    """

    def __init__(self, proxy):
        super().__init__()
        self.proxy = proxy  # e.g. "1.2.3.4:1080" from the SOCKS4 list

    def _get(self, url, headers=None):
        """Perform a GET through ``self.proxy``; raises on HTTP errors."""
        response = requests.get(
            url,
            headers=headers,
            proxies={"http": self.proxy, "https": self.proxy},
            timeout=30,  # dead proxies must fail fast, not hang the retry loop
        )
        response.raise_for_status()
        return response


def search_with_retries(query, max_results=3, max_retries=5, backoff_factor=1):
    """Run a DuckDuckGo text search, rotating to a new proxy on each failure.

    Args:
        query: search string.
        max_results: maximum number of results to request.
        max_retries: attempts before giving up.
        backoff_factor: linear backoff multiplier (sleep = factor * attempt).

    Returns:
        tuple: ``(results, proxy)`` — result dicts and the proxy that worked.

    Raises:
        RuntimeError: if every attempt fails.
    """
    for attempt in range(1, max_retries + 1):
        proxy = next(proxy_cycle)
        try:
            searcher = ProxiedDDGS(proxy)
            return searcher.text(query, max_results=max_results), proxy
        except Exception:
            # Bad proxy, network error, or rate limit: back off linearly,
            # then try the next proxy in the cycle.
            time.sleep(backoff_factor * attempt)
    raise RuntimeError(f"All retries failed for query: {query}")


# Hugging Face Inference endpoint used for feature extraction.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    max_new_tokens=128,
    temperature=0.7,
    do_sample=False,
)

# Prompt template for feature extraction; the model must answer with JSON only.
template_extract_features = '''
You are a product feature extractor bot. Your task is to determine features like Brand, Model, Type, RAM, Storage, etc., from the given product description and web search results.
Given product description: {TEXT}
Relevant web search results: {SEARCH_RESULTS}
Return features in JSON format with keys like Brand, Model, Type, RAM, Storage, and others. Your response MUST only include a valid JSON object and nothing else.
Example: {{ "Brand": "Apple", "Model": "iPhone 14", "Type": "Smartphone", "RAM": "4GB", "Storage": "128GB" }}
'''

json_output_parser = JsonOutputParser()


def extract_features(description):
    """Detect language, gather web context, and extract features via the LLM.

    Args:
        description: free-text product description.

    Returns:
        tuple: ``(language_code, features_dict, elapsed_seconds)``.
    """
    start = time.time()

    try:
        lang = detect(description)
    except Exception:
        # langdetect raises on empty/ambiguous input; default to English
        # rather than failing the whole request.
        lang = "en"

    try:
        search_results, _ = search_with_retries(description, max_results=3)
        # DDGS text results carry the summary under "body"; keep "snippet"
        # as a fallback for older duckduckgo_search versions.
        search_text = "\n".join(
            res.get("body", res.get("snippet", "")) for res in search_results
        )
    except RuntimeError:
        search_text = "No search results available."

    prompt_extract = PromptTemplate(
        template=template_extract_features,
        input_variables=["TEXT", "SEARCH_RESULTS"],
    )
    formatted_prompt = prompt_extract.format(
        TEXT=description, SEARCH_RESULTS=search_text
    )

    response = llm.invoke(formatted_prompt)
    parsed_output = json_output_parser.parse(response)

    return lang, parsed_output, time.time() - start


def create_gradio_interface():
    """Build and launch the Gradio UI wired to :func:`extract_features`."""
    with gr.Blocks() as iface:
        text_input = gr.Textbox(label="Item Description")
        lang_output = gr.Textbox(label="Detected Language")
        feature_output = gr.Textbox(label="Extracted Features (JSON)")
        time_taken = gr.Textbox(label="Time Taken (seconds)")
        submit_btn = gr.Button("Extract Features")

        def on_submit(text):
            lang, features, duration = extract_features(text)
            return lang, features, f"{duration:.2f} seconds"

        submit_btn.click(
            fn=on_submit,
            inputs=text_input,
            outputs=[lang_output, feature_output, time_taken],
        )

    iface.launch()


if __name__ == "__main__":
    create_gradio_interface()