import gradio as gr
import torch
import spaces
import logging
from deep_translator import GoogleTranslator
import pandas as pd
from tqdm import tqdm
import urllib.request
from bs4 import BeautifulSoup

# Configure logging to write messages to a file
logging.basicConfig(filename='app.log', level=logging.ERROR)

# Configuration
max_seq_length = 2048
dtype = None  # Auto detection of dtype
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage

# peft_model_name = "limitedonly41/website_qwen2_7b_2"
peft_model_name = "limitedonly41/website_mistral7b_v02"
# Initialize model and tokenizer variables
model = None
tokenizer = None
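# The model is loaded lazily inside classify_website (below), so the heavy
# download happens on the first request rather than at import time.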

def fetch_data(url):
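    """Download a page and extract its title, meta tags, headings, and paragraphs."""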
    headers = {
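        # Browser-like headers; some sites may block requests without them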
        'Accept': '*/*',
        'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
        'Connection': 'keep-alive',
        'Referer': url,
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'cross-site',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
    }

    encoding = 'utf-8'
    timeout = 10  # Request timeout in seconds
    try:
        # Make the request using urllib
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=timeout) as response:
            response_content = response.read()

        soup = BeautifulSoup(response_content, 'html.parser', from_encoding=encoding)

        title_tag = soup.find('title')
        title = title_tag.text if title_tag else ""
        description = soup.find('meta', attrs={'name': 'description'})
        description = description.get("content") if description and "content" in description.attrs else ""

        keywords = soup.find('meta', attrs={'name': 'keywords'})
        keywords = keywords.get("content") if keywords and "content" in keywords.attrs else ""

        h1_all = ". ".join(h.text for h in soup.find_all('h1'))
        paragraphs_all = ". ".join(p.text for p in soup.find_all('p'))
        h2_all = ". ".join(h.text for h in soup.find_all('h2'))
        h3_all = ". ".join(h.text for h in soup.find_all('h3'))

        # Clean up whitespace before combining ('\xa0' is a non-breaking space;
        # the original raw string r'\xa0' matched a literal backslash sequence, not the character)
        h1_all = h1_all.replace('\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
        h2_all = h2_all.replace('\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
        h3_all = h3_all.replace('\xa0', ' ').replace('\n', ' ').replace('\t', ' ')

        # Combine everything, truncated to stay within the translator's ~5000-character limit
        allthecontent = f"{title} {description} {h1_all} {h2_all} {h3_all} {paragraphs_all}"[:4999]

        return {
            'url': url,
            'title': title,
            'description': description,
            'keywords': keywords,
            'h1': h1_all,
            'h2': h2_all,
            'h3': h3_all,
            'paragraphs': paragraphs_all,
            'text': allthecontent
        }
    except Exception as e:
        logging.error(f"Failed to fetch {url}: {e}")
        return {
            'url': url,
            'title': None,
            'description': None,
            'keywords': None,
            'h1': None,
            'h2': None,
            'h3': None,
            'paragraphs': None,
            'text': None
        }
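
# Example (illustrative URL):
#   fetch_data("https://example.com") returns a dict with keys
#   url/title/description/keywords/h1/h2/h3/paragraphs/text (all but url are None on failure).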

def main(urls):
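    """Fetch every URL in sequence and collect the extracted fields."""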
    results = []
    for url in tqdm(urls):
        result = fetch_data(url)
        results.append(result)
    return results


@spaces.GPU()
def classify_website(url):
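    """Scrape the given URL, translate its text to English, and classify the site."""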
    from unsloth import FastLanguageModel  # Imported here so unsloth initializes inside the GPU process

    global model, tokenizer  # Declare model and tokenizer as global variables

    if model is None or tokenizer is None:
        # Load the model and tokenizer on first use
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=peft_model_name,
            max_seq_length=max_seq_length,
            dtype=dtype,
            load_in_4bit=load_in_4bit,
        )
        FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

    urls = [url]
    results_shop = main(urls)

    # Convert results to a DataFrame and pull out the combined page text
    df_result_train_more = pd.DataFrame(results_shop)
    text = df_result_train_more['text'][0]
    if text is None:
        return "Could not fetch the website content."

    # Translate to English before classification, truncated to the translator's limit
    translated = GoogleTranslator(source='auto', target='en').translate(text[:4990])

    try:
        # Prepare the input prompt for the model
        prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Categorize the website into one of the 3 categories:
1) OTHER
2) NEWS/BLOG
3) E-commerce
### Input:
{translated}
### Response:"""

        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
        ans = tokenizer.batch_decode(outputs)[0]
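        # Keep only the text after the response marker, dropping the trailing EOS token (e.g. "</s>")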
        ans_pred = ans.split('### Response:')[1].split('<')[0]

        if 'OTHER' in ans_pred:
            ans_pred = 'OTHER'
        elif 'NEWS/BLOG' in ans_pred:
            ans_pred = 'NEWS/BLOG'
        elif 'E-commerce' in ans_pred:
            ans_pred = 'E-commerce'

        return ans_pred

    except Exception as e:
        logging.exception(e)
        return str(e)

# Create a Gradio interface
iface = gr.Interface(
    fn=classify_website,
    inputs="text",
    outputs="text",
    title="Website Categorization",
    description="Categorize a website into one of the 3 categories: OTHER, NEWS/BLOG, or E-commerce."
)

# Launch the interface
iface.launch()
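
# Note: when running outside Hugging Face Spaces, a shareable public link can be
# created with iface.launch(share=True) (standard Gradio option).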