limitedonly41 committed
Commit 5cc7129 · verified · 1 Parent(s): 604f9be

Update app.py

Files changed (1)
  1. app.py +40 -120
app.py CHANGED
@@ -3,7 +3,12 @@ import torch
 import spaces
 import logging
 from deep_translator import GoogleTranslator
-
+import pandas as pd
+from tqdm import tqdm
+import urllib
+import aiohttp
+import asyncio
+from bs4 import BeautifulSoup
 
 # Configure logging to write messages to a file
 logging.basicConfig(filename='app.log', level=logging.ERROR)
@@ -12,44 +17,31 @@ logging.basicConfig(filename='app.log', level=logging.ERROR)
 max_seq_length = 2048
 dtype = None # Auto detection of dtype
 load_in_4bit = True # Use 4-bit quantization to reduce memory usage
-
-peft_model_name = "limitedonly41/website_mistral7b_v02_1200_finetuned_7"
+peft_model_name = "limitedonly41/website_qwen2_7b_2"
 
 # Initialize model and tokenizer variables
 model = None
 tokenizer = None
 
-
-
-
-import pandas as pd
-from tqdm import tqdm
-import urllib
-import aiohttp
-import asyncio
-from bs4 import BeautifulSoup
-
+# Async function to fetch data
 async def fetch_data(url):
     headers = {
         'Accept': '*/*',
         'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
         'Connection': 'keep-alive',
-        # 'Origin': 'https://www.beckman.es',
         'Referer': f'{url}',
         'Sec-Fetch-Dest': 'empty',
         'Sec-Fetch-Mode': 'cors',
         'Sec-Fetch-Site': 'cross-site',
-        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
+        'User-Agent': 'Mozilla/5.0',
         'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
         'sec-ch-ua-mobile': '?0',
         'sec-ch-ua-platform': '"macOS"',
     }
 
-
-    # encoding = 'windows-1251'
     encoding = 'utf-8'
-
     timeout = 10 # Set your desired timeout value in seconds
+
     try:
         # Function to make the request using urllib
         def get_content():
@@ -57,90 +49,26 @@ async def fetch_data(url):
             with urllib.request.urlopen(req, timeout=timeout) as response:
                 return response.read()
 
+        # Async task using executor for blocking I/O
+        loop = asyncio.get_event_loop()
         response_content = await loop.run_in_executor(None, get_content)
 
         soup = BeautifulSoup(response_content, 'html.parser', from_encoding=encoding)
-
-        title = soup.find('title').text
+        title = soup.find('title').text if soup.find('title') else ""
         description = soup.find('meta', attrs={'name': 'description'})
-        if description and "content" in description.attrs:
-            description = description.get("content")
-        else:
-            description = ""
+        description = description.get("content") if description and "content" in description.attrs else ""
 
         keywords = soup.find('meta', attrs={'name': 'keywords'})
-        if keywords and "content" in keywords.attrs:
-            keywords = keywords.get("content")
-        else:
-            keywords = ""
-
-        # h1_all = " ".join(h.text for h in soup.find_all('h1'))
-        # h2_all = " ".join(h.text for h in soup.find_all('h2'))
-        # h3_all = " ".join(h.text for h in soup.find_all('h3'))
-        # paragraphs_all = " ".join(p.text for p in soup.find_all('p'))
-
-
-
-        h1 = soup.find_all('h1')
-        h1_all = ""
-
-        try:
-            for x in range (len(h1)):
-                if x == len(h1) -1:
-                    h1_all = h1_all + h1[x].text
-                else:
-                    h1_all = h1_all + h1[x].text + ". "
-        except:
-            h1_all = ""
-
-        paragraphs_all = ""
-        paragraphs = soup.find_all('p')
-        try:
-            for x in range (len(paragraphs)):
-                if x == len(paragraphs) -1:
-                    paragraphs_all = paragraphs_all + paragraphs[x].text
-                else:
-                    paragraphs_all = paragraphs_all + paragraphs[x].text + ". "
-        except:
-            paragraphs_all = ""
-
-        h2 = soup.find_all('h2')
-        h2_all = ""
-        try:
-            for x in range (len(h2)):
-                if x == len(h2) -1:
-                    h2_all = h2_all + h2[x].text
-                else:
-                    h2_all = h2_all + h2[x].text + ". "
-        except:
-            h2_all = ""
-
-        h3 = soup.find_all('h3')
-        h3_all = ""
-
-        try:
-            for x in range (len(h3)):
-                if x == len(h3) -1:
-                    h3_all = h3_all + h3[x].text
-                else:
-                    h3_all = h3_all + h3[x].text + ". "
-        except:
-            h3_all = ""
-
+        keywords = keywords.get("content") if keywords and "content" in keywords.attrs else ""
 
+        h1_all = ". ".join(h.text for h in soup.find_all('h1'))
+        paragraphs_all = ". ".join(p.text for p in soup.find_all('p'))
+        h2_all = ". ".join(h.text for h in soup.find_all('h2'))
+        h3_all = ". ".join(h.text for h in soup.find_all('h3'))
 
         allthecontent = f"{title} {description} {h1_all} {h2_all} {h3_all} {paragraphs_all}"
         allthecontent = allthecontent[:4999]
 
-        # Clean up the text
-        h1_all = h1_all.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
-        h2_all = h2_all.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
-        h3_all = h3_all.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
-
-        title = title.replace(r'\xa0', ' ')
-        description = description.replace(r'\xa0', ' ')
-        keywords = keywords.replace(r'\xa0', ' ')
-
         return {
             'url': url,
             'title': title,
@@ -152,8 +80,9 @@ async def fetch_data(url):
             'paragraphs': paragraphs_all,
             'text': allthecontent
         }
+
     except Exception as e:
-        print(url, e)
+        logging.exception(f"Error fetching data for {url}: {e}")
         return {
             'url': url,
             'title': None,
@@ -166,6 +95,7 @@ async def fetch_data(url):
             'text': None
         }
 
+# Main async function to process multiple URLs
 async def main(urls):
     tasks = [fetch_data(url) for url in urls]
     results = []
@@ -174,44 +104,37 @@ async def main(urls):
         results.append(result)
     return results
 
-
-
-
-
-
-
 @spaces.GPU()
 def classify_website(url):
-    global model, tokenizer # Declare model and tokenizer as global variables
+    global model, tokenizer
 
     urls = [url]
-
-    # Run asyncio event loop
-    loop = asyncio.get_event_loop()
-    results_shop = await main(urls[:]) # Instead of loop.run_until_complete(main(urls))
-
+
+    # Start asyncio loop for fetching data
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    results_shop = loop.run_until_complete(main(urls)) # Correctly use asyncio loop
+
     # Convert results to DataFrame
     df_result_train_more = pd.DataFrame(results_shop)
-
     text = df_result_train_more['text'][0]
     translated = GoogleTranslator(source='auto', target='en').translate(text[:4990])
 
     try:
-        # Load the model and tokenizer if they are not already loaded
+        # Load the model and tokenizer if not already loaded
        if model is None or tokenizer is None:
            from unsloth import FastLanguageModel
-
-            # Load the model and tokenizer
+
            model, tokenizer = FastLanguageModel.from_pretrained(
-                model_name=peft_model_name, # YOUR MODEL YOU USED FOR TRAINING
+                model_name=peft_model_name,
                max_seq_length=max_seq_length,
                dtype=dtype,
                load_in_4bit=load_in_4bit,
            )
-            FastLanguageModel.for_inference(model) # Enable native 2x faster inference
-
+            FastLanguageModel.for_inference(model)
+
        prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
-
+
 ### Instruction:
 Categorize the website into one of the 3 categories:
 
@@ -223,21 +146,19 @@ Categorize the website into one of the 3 categories:
 {translated}
 
 ### Response:"""
-
+
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
        ans = tokenizer.batch_decode(outputs)[0]
        ans_pred = ans.split('### Response:')[1].split('<')[0]
-
+
        if 'OTHER' in ans_pred:
            ans_pred = 'OTHER'
        elif 'NEWS/BLOG' in ans_pred:
            ans_pred = 'NEWS/BLOG'
        elif 'E-commerce' in ans_pred:
            ans_pred = 'E-commerce'
-        # else:
-        #     ans_pred = 'OTHER'
-
+
        return ans_pred
 
    except Exception as e:
@@ -252,7 +173,6 @@ iface = gr.Interface(
    title="Website Categorization",
    description="Categorize a website into one of the 3 categories: OTHER, NEWS/BLOG, or E-commerce."
)
-iface.queue() # <-- Sets up a queue with default parameters
-
-# Launch the interface
+iface.queue() # Enable queue with default settings
 iface.launch()
+
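
For reference, a minimal local sanity check of the updated entry point might look like the sketch below; it is only an illustration, assuming the unsloth/CUDA dependencies used by app.py are installed, and "https://example.com" is a placeholder URL rather than anything from this commit.

# Minimal usage sketch (assumption: run outside the Gradio UI, with the
# dependencies from the updated app.py available; the URL is a placeholder).
if __name__ == "__main__":
    print(classify_website("https://example.com"))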