import gradio as gr
import torch
import spaces
import logging
from deep_translator import GoogleTranslator
import pandas as pd
from tqdm import tqdm
import urllib.request
from bs4 import BeautifulSoup

# Configure logging to write messages to a file
logging.basicConfig(filename='app.log', level=logging.ERROR)

# Configuration
max_seq_length = 2048
dtype = None  # Auto detection of dtype
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage

# peft_model_name = "limitedonly41/website_qwen2_7b_2"
peft_model_name = "limitedonly41/website_mistral7b_v02"
# Initialize model and tokenizer variables
model = None
tokenizer = None
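# The model is loaded lazily inside classify_website (below), so the heavy
# download happens on the first request rather than at import time.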

def fetch_data(url):
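    """Download a page and extract its title, meta tags, headings, and paragraphs."""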
    headers = {
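        # Browser-like headers; some sites may block requests without them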
        'Accept': '*/*',
        'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
        'Connection': 'keep-alive',
        'Referer': url,
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'cross-site',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
    }

    encoding = 'utf-8'
    timeout = 10  # Request timeout in seconds
    try:
        # Make the request using urllib
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=timeout) as response:
            response_content = response.read()

        soup = BeautifulSoup(response_content, 'html.parser', from_encoding=encoding)

        title_tag = soup.find('title')
        title = title_tag.text if title_tag else ""
        description = soup.find('meta', attrs={'name': 'description'})
        description = description.get("content") if description and "content" in description.attrs else ""

        keywords = soup.find('meta', attrs={'name': 'keywords'})
        keywords = keywords.get("content") if keywords and "content" in keywords.attrs else ""

        h1_all = ". ".join(h.text for h in soup.find_all('h1'))
        paragraphs_all = ". ".join(p.text for p in soup.find_all('p'))
        h2_all = ". ".join(h.text for h in soup.find_all('h2'))
        h3_all = ". ".join(h.text for h in soup.find_all('h3'))

        # Clean up whitespace before combining ('\xa0' is a non-breaking space;
        # the original raw string r'\xa0' matched a literal backslash sequence, not the character)
        h1_all = h1_all.replace('\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
        h2_all = h2_all.replace('\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
        h3_all = h3_all.replace('\xa0', ' ').replace('\n', ' ').replace('\t', ' ')

        # Combine everything, truncated to stay within the translator's ~5000-character limit
        allthecontent = f"{title} {description} {h1_all} {h2_all} {h3_all} {paragraphs_all}"[:4999]

        return {
            'url': url,
            'title': title,
            'description': description,
            'keywords': keywords,
            'h1': h1_all,
            'h2': h2_all,
            'h3': h3_all,
            'paragraphs': paragraphs_all,
            'text': allthecontent
        }
    except Exception as e:
        logging.error(f"Failed to fetch {url}: {e}")
        return {
            'url': url,
            'title': None,
            'description': None,
            'keywords': None,
            'h1': None,
            'h2': None,
            'h3': None,
            'paragraphs': None,
            'text': None
        }
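
# Example (illustrative URL):
#   fetch_data("https://example.com") returns a dict with keys
#   url/title/description/keywords/h1/h2/h3/paragraphs/text (all but url are None on failure).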

def main(urls):
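    """Fetch every URL in sequence and collect the extracted fields."""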
    results = []
    for url in tqdm(urls):
        result = fetch_data(url)
        results.append(result)
    return results


@spaces.GPU()
def classify_website(url):
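    """Scrape the given URL, translate its text to English, and classify the site."""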
    from unsloth import FastLanguageModel  # Imported here so unsloth initializes inside the GPU process

    global model, tokenizer  # Declare model and tokenizer as global variables

    if model is None or tokenizer is None:
        # Load the model and tokenizer on first use
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=peft_model_name,
            max_seq_length=max_seq_length,
            dtype=dtype,
            load_in_4bit=load_in_4bit,
        )
        FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

    urls = [url]
    results_shop = main(urls)

    # Convert results to a DataFrame and pull out the combined page text
    df_result_train_more = pd.DataFrame(results_shop)
    text = df_result_train_more['text'][0]
    if text is None:
        return "Could not fetch the website content."

    # Translate to English before classification, truncated to the translator's limit
    translated = GoogleTranslator(source='auto', target='en').translate(text[:4990])

    try:
        # Prepare the input prompt for the model
        prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Categorize the website into one of the 3 categories:
1) OTHER
2) NEWS/BLOG
3) E-commerce
### Input:
{translated}
### Response:"""

        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
        ans = tokenizer.batch_decode(outputs)[0]
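        # Keep only the text after the response marker, dropping the trailing EOS token (e.g. "</s>")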
        ans_pred = ans.split('### Response:')[1].split('<')[0]

        if 'OTHER' in ans_pred:
            ans_pred = 'OTHER'
        elif 'NEWS/BLOG' in ans_pred:
            ans_pred = 'NEWS/BLOG'
        elif 'E-commerce' in ans_pred:
            ans_pred = 'E-commerce'

        return ans_pred

    except Exception as e:
        logging.exception(e)
        return str(e)

# Create a Gradio interface
iface = gr.Interface(
    fn=classify_website,
    inputs="text",
    outputs="text",
    title="Website Categorization",
    description="Categorize a website into one of the 3 categories: OTHER, NEWS/BLOG, or E-commerce."
)

# Launch the interface
iface.launch()
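
# Note: when running outside Hugging Face Spaces, a shareable public link can be
# created with iface.launch(share=True) (standard Gradio option).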