limitedonly41 committed
Commit 69e9c2b · verified · 1 Parent(s): 295889a

Update app.py

Files changed (1):
  1. app.py +179 -4
app.py CHANGED
@@ -2,6 +2,8 @@ import gradio as gr
import torch
import spaces
import logging
+from deep_translator import GoogleTranslator
+

# Configure logging to write messages to a file
logging.basicConfig(filename='app.log', level=logging.ERROR)
@@ -17,10 +19,183 @@ peft_model_name = "limitedonly41/website_mistral7b_v02_1200_finetuned_7"
model = None
tokenizer = None

+
+
+
+import pandas as pd
+from tqdm import tqdm
+import urllib
+import aiohttp
+import asyncio
+from bs4 import BeautifulSoup
+
+async def fetch_data(url):
+    headers = {
+        'Accept': '*/*',
+        'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
+        'Connection': 'keep-alive',
+        # 'Origin': 'https://www.beckman.es',
+        'Referer': f'{url}',
+        'Sec-Fetch-Dest': 'empty',
+        'Sec-Fetch-Mode': 'cors',
+        'Sec-Fetch-Site': 'cross-site',
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
+        'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': '"macOS"',
+    }
+
+
+    # encoding = 'windows-1251'
+    encoding = 'utf-8'
+
+    timeout = 10  # Set your desired timeout value in seconds
+    try:
+        # Function to make the request using urllib
+        def get_content():
+            req = urllib.request.Request(url, headers=headers)
+            with urllib.request.urlopen(req, timeout=timeout) as response:
+                return response.read()
+
+        response_content = await loop.run_in_executor(None, get_content)
+
+        soup = BeautifulSoup(response_content, 'html.parser', from_encoding=encoding)
+
+        title = soup.find('title').text
+        description = soup.find('meta', attrs={'name': 'description'})
+        if description and "content" in description.attrs:
+            description = description.get("content")
+        else:
+            description = ""
+
+        keywords = soup.find('meta', attrs={'name': 'keywords'})
+        if keywords and "content" in keywords.attrs:
+            keywords = keywords.get("content")
+        else:
+            keywords = ""
+
+        # h1_all = " ".join(h.text for h in soup.find_all('h1'))
+        # h2_all = " ".join(h.text for h in soup.find_all('h2'))
+        # h3_all = " ".join(h.text for h in soup.find_all('h3'))
+        # paragraphs_all = " ".join(p.text for p in soup.find_all('p'))
+
+
+
+        h1 = soup.find_all('h1')
+        h1_all = ""
+
+        try:
+            for x in range(len(h1)):
+                if x == len(h1) - 1:
+                    h1_all = h1_all + h1[x].text
+                else:
+                    h1_all = h1_all + h1[x].text + ". "
+        except:
+            h1_all = ""
+
+        paragraphs_all = ""
+        paragraphs = soup.find_all('p')
+        try:
+            for x in range(len(paragraphs)):
+                if x == len(paragraphs) - 1:
+                    paragraphs_all = paragraphs_all + paragraphs[x].text
+                else:
+                    paragraphs_all = paragraphs_all + paragraphs[x].text + ". "
+        except:
+            paragraphs_all = ""
+
+        h2 = soup.find_all('h2')
+        h2_all = ""
+        try:
+            for x in range(len(h2)):
+                if x == len(h2) - 1:
+                    h2_all = h2_all + h2[x].text
+                else:
+                    h2_all = h2_all + h2[x].text + ". "
+        except:
+            h2_all = ""
+
+        h3 = soup.find_all('h3')
+        h3_all = ""
+
+        try:
+            for x in range(len(h3)):
+                if x == len(h3) - 1:
+                    h3_all = h3_all + h3[x].text
+                else:
+                    h3_all = h3_all + h3[x].text + ". "
+        except:
+            h3_all = ""
+
+
+
+        allthecontent = f"{title} {description} {h1_all} {h2_all} {h3_all} {paragraphs_all}"
+        allthecontent = allthecontent[:4999]
+
+        # Clean up the text
+        h1_all = h1_all.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
+        h2_all = h2_all.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
+        h3_all = h3_all.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
+
+        title = title.replace(r'\xa0', ' ')
+        description = description.replace(r'\xa0', ' ')
+        keywords = keywords.replace(r'\xa0', ' ')
+
+        return {
+            'url': url,
+            'title': title,
+            'description': description,
+            'keywords': keywords,
+            'h1': h1_all,
+            'h2': h2_all,
+            'h3': h3_all,
+            'paragraphs': paragraphs_all,
+            'text': allthecontent
+        }
+    except Exception as e:
+        print(url, e)
+        return {
+            'url': url,
+            'title': None,
+            'description': None,
+            'keywords': None,
+            'h1': None,
+            'h2': None,
+            'h3': None,
+            'paragraphs': None,
+            'text': None
+        }
+
+async def main(urls):
+    tasks = [fetch_data(url) for url in urls]
+    results = []
+    for future in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
+        result = await future
+        results.append(result)
+    return results
+
+
+
+
+
+
+
@spaces.GPU()
-def classify_website(site_text):
+def classify_website(url):
    global model, tokenizer  # Declare model and tokenizer as global variables

+    urls = [url]
+
+    # Run asyncio event loop
+    loop = asyncio.get_event_loop()
+    results_shop = await main(urls[:])  # Instead of loop.run_until_complete(main(urls))
+
+    # Convert results to DataFrame
+    df_result_train_more = pd.DataFrame(results_shop)
+
+    text = df_result_train_more['text'][0]
+    translated = GoogleTranslator(source='auto', target='en').translate(text[:4990])
+
    try:
        # Load the model and tokenizer if they are not already loaded
        if model is None or tokenizer is None:
@@ -45,7 +220,7 @@ Categorize the website into one of the 3 categories:
3) E-commerce

### Input:
-{site_text}
+{translated}

### Response:"""

@@ -60,8 +235,8 @@ Categorize the website into one of the 3 categories:
        ans_pred = 'NEWS/BLOG'
    elif 'E-commerce' in ans_pred:
        ans_pred = 'E-commerce'
-    else:
-        ans_pred = 'OTHER'
+    # else:
+    #     ans_pred = 'OTHER'

    return ans_pred
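
Note on the added async flow: as committed, classify_website stays a plain def yet awaits main(urls[:]) directly, and fetch_data reads a module-level loop for run_in_executor that is only assigned inside the handler. A minimal sketch of one way to drive the same coroutines from a synchronous handler, assuming the fetch_data/main helpers above; run_scrape is a hypothetical helper, not part of this commit:

import asyncio

def run_scrape(urls):
    # Hypothetical helper: create a private event loop for this call and
    # expose it as the module-level `loop` that fetch_data() relies on.
    global loop
    loop = asyncio.new_event_loop()
    try:
        # Drive the async scraping pipeline to completion from sync code.
        return loop.run_until_complete(main(urls))
    finally:
        loop.close()

# e.g. inside classify_website:
# results_shop = run_scrape([url])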