cyberandy committed on
Commit
f663df1
·
verified ·
1 Parent(s): fa155f9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -30
app.py CHANGED
@@ -23,6 +23,37 @@ class WebsiteCrawler:
23
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
24
  }
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  def clean_text(self, text, is_title=False):
27
  """Improved text cleaning"""
28
  if not text or len(text.strip()) < 2:
@@ -37,21 +68,73 @@ class WebsiteCrawler:
37
  text = re.sub(r'\{\%.*?\%\}', '', text)
38
  text = re.sub(r'\${.*?\}', '', text)
39
 
40
- # Remove broken/malformed text
41
- if len(re.findall(r'[a-zA-Z]', text)) < 10: # If less than 10 letters
42
- return ""
43
-
44
- # Clean up title specifically
45
  if is_title:
46
  # Remove common suffixes and fragments for titles
47
  text = re.sub(r'^\s*Welcome to\s+', '', text)
48
  text = re.sub(r'\s*[\|\-#:•].*', '', text)
49
- # Remove company name if it's redundant
50
- text = re.sub(r'\s*[-|]\s*.*?$', '', text)
51
- # Remove generic suffixes
52
  text = re.sub(r'\s+Homepage$', '', text, flags=re.IGNORECASE)
 
 
 
 
 
 
 
 
 
 
 
53
 
54
- return " ".join(text.split()).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  def clean_description(self, desc):
57
  """Clean description text"""
@@ -64,25 +147,6 @@ class WebsiteCrawler:
64
  return ""
65
  return desc.strip()
66
 
67
- def determine_category_importance(self, url, title, desc):
68
- """Improved category detection"""
69
- url_lower = url.lower()
70
- path = urlparse(url).path.lower()
71
-
72
- if path == "/" or path == "":
73
- return "Main", 10
74
- elif any(x in url_lower for x in ['/docs', '/documentation', '/guide', '/manual']):
75
- return "Documentation", 8
76
- elif any(x in url_lower for x in ['/api', '/developer']):
77
- return "API", 8
78
- elif any(x in url_lower for x in ['/about', '/contact']):
79
- return "About", 7
80
- elif any(x in path for x in ['.html', '.md', '.txt', '/']):
81
- return "Content", 4
82
- elif any(x in url_lower for x in ['/blog', '/news', '/article']):
83
- return "Blog", 5
84
-
85
- return "Optional", 1
86
 
87
  def is_duplicate_content(self, desc, title, url):
88
  """Improved duplicate/translation detection"""
@@ -330,9 +394,9 @@ class WebsiteCrawler:
330
  "Main",
331
  "Documentation",
332
  "API",
 
333
  "About",
334
- "Content",
335
- "Blog",
336
  "Optional"
337
  ]
338
 
 
23
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
24
  }
25
 
26
def determine_category_importance(self, url, title, desc):
    """Classify a crawled URL into a site section and assign a crawl priority.

    Matching is done by case-insensitive substring tests against the full
    URL (path check only for the homepage), in priority order — the first
    matching rule wins.

    Args:
        url: Absolute URL of the page.
        title: Page title (currently unused; kept for interface stability).
        desc: Page description (currently unused; kept for interface stability).

    Returns:
        tuple[str, int]: (category name, importance score, 1 = lowest, 10 = highest).
    """
    url_lower = url.lower()
    path = urlparse(url).path.lower()

    # Homepage gets top priority.
    if path in ("", "/"):
        return "Main", 10

    # Documentation, features and pricing pages.
    if any(x in url_lower for x in ['/docs', '/documentation', '/features', '/pricing']):
        return "Documentation", 8

    # API / developer pages ('developers' also matches without a leading slash).
    if any(x in url_lower for x in ['/api', '/developer', 'developers']):
        return "API", 8

    # About / company pages.
    if any(x in url_lower for x in ['/about', '/company', '/partners', '/stories']):
        return "About", 7

    # News and updates.
    if any(x in url_lower for x in ['/news', '/blog', '/releases', '/academy']):
        return "News", 5

    # Tools pages. NOTE: '/features' was also listed here, but it was
    # unreachable — the Documentation rule above always claims it first —
    # so the dead duplicate has been removed (no behavior change).
    if any(x in url_lower for x in ['/tools', '/website', '/keyword']):
        return "Tools", 6

    return "Optional", 1
56
+
57
  def clean_text(self, text, is_title=False):
58
  """Improved text cleaning"""
59
  if not text or len(text.strip()) < 2:
 
68
  text = re.sub(r'\{\%.*?\%\}', '', text)
69
  text = re.sub(r'\${.*?\}', '', text)
70
 
 
 
 
 
 
71
  if is_title:
72
  # Remove common suffixes and fragments for titles
73
  text = re.sub(r'^\s*Welcome to\s+', '', text)
74
  text = re.sub(r'\s*[\|\-#:•].*', '', text)
 
 
 
75
  text = re.sub(r'\s+Homepage$', '', text, flags=re.IGNORECASE)
76
+
77
+ # Handle overly generic titles
78
+ if text.lower() in ['features', 'home', 'homepage', 'welcome']:
79
+ return ""
80
+
81
+ # Only return if we have meaningful text
82
+ cleaned = " ".join(text.split()).strip()
83
+ if len(cleaned.split()) < 2 and not is_title: # Allow single-word titles
84
+ return ""
85
+
86
+ return cleaned
87
 
88
async def process_homepage(self, url):
    """Fetch the homepage and populate ``self.homepage_metadata``.

    Tries several sources for the site name, in decreasing order of
    reliability: the ``og:site_name`` meta tag, JSON-LD structured data,
    the ``<title>`` tag, and finally the domain name. On any fetch/parse
    failure, falls back to the domain name with no description.

    NOTE(review): ``requests.get`` is a blocking call inside an ``async def``
    and will stall the event loop while the request is in flight — consider
    an async HTTP client or ``loop.run_in_executor``; confirm with callers.

    Args:
        url: Homepage URL to fetch.

    Side effects:
        Sets ``self.homepage_metadata`` to a dict with keys
        ``"site_name"`` (str) and ``"description"`` (str or None).
    """
    try:
        response = requests.get(url, headers=self.headers, timeout=10)
        response.encoding = "utf-8"
        soup = BeautifulSoup(response.text, "html.parser")

        site_name = None

        # 1) Open Graph meta tag — the most explicit signal.
        site_meta = soup.find("meta", property="og:site_name")
        if site_meta and site_meta.get("content"):
            site_name = site_meta["content"]

        # 2) JSON-LD structured data.
        if not site_name:
            schema = soup.find("script", type="application/ld+json")
            if schema:
                try:
                    import json
                    data = json.loads(schema.string)
                    if isinstance(data, dict):
                        site_name = data.get("name") or data.get("organizationName")
                except (ValueError, TypeError):
                    # ValueError covers malformed JSON (json.JSONDecodeError is
                    # a subclass); TypeError covers an empty <script> element,
                    # where schema.string is None. The previous bare `except:`
                    # also swallowed SystemExit/KeyboardInterrupt.
                    pass

        # 3) <title> tag, keeping only the part before any "|" separator.
        if not site_name:
            title_tag = soup.find("title")
            if title_tag:
                site_name = title_tag.text.split('|')[0].strip()

        # 4) Last resort — derive a name from the domain.
        if not site_name:
            site_name = urlparse(url).netloc.split('.')[0].capitalize()

        # Homepage description comes from a dedicated extractor.
        description = self.extract_homepage_description(soup)

        self.homepage_metadata = {
            "site_name": self.clean_text(site_name, is_title=True),
            "description": description
        }

    except Exception as e:
        logger.error(f"Error processing homepage {url}: {str(e)}")
        self.homepage_metadata = {
            "site_name": urlparse(url).netloc.split('.')[0].capitalize(),
            "description": None
        }
138
 
139
  def clean_description(self, desc):
140
  """Clean description text"""
 
147
  return ""
148
  return desc.strip()
149
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
  def is_duplicate_content(self, desc, title, url):
152
  """Improved duplicate/translation detection"""
 
394
  "Main",
395
  "Documentation",
396
  "API",
397
+ "Tools",
398
  "About",
399
+ "News",
 
400
  "Optional"
401
  ]
402