Spaces:

throaway2854
/

datasetbuilder

Paused

App Files Files Community

throaway2854 committed on Aug 18, 2024

Commit

ade988f

verified ·

1 Parent(s): 1f07e5c

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -5

app.py CHANGED Viewed

@@ -41,12 +41,31 @@ def make_request(url, cookies=None):
 def extract_image_url(html_content):
     soup = BeautifulSoup(html_content, 'html.parser')
     script = soup.find('script', type='text/javascript', string=lambda text: 'image =' in text if text else False)
     if script:
-        image_data = json.loads(script.string.split('=', 1)[1].strip().rstrip(';'))
-        return f"{image_data['domain']}{image_data['base_dir']}/{image_data['dir']}/{image_data['img']}"
     img_tag = soup.find('img', alt=True)
     if img_tag and 'src' in img_tag.attrs:
         return img_tag['src']
@@ -55,14 +74,19 @@ def extract_image_url(html_content):
 def extract_tags(html_content):
     soup = BeautifulSoup(html_content, 'html.parser')
     tag_elements = soup.find_all('li', class_='tag-type-general')
     tags = []
     for tag_element in tag_elements:
-        tag_link = tag_element.find_all('a')[1]
-        if tag_link:
-            tags.append(tag_link.text)
     return ','.join(tags)
 def download_image(url, cookies=None):

 def extract_image_url(html_content):
     """Return the image URL found in *html_content*.

     The markup is first searched for a ``<script type="text/javascript">``
     whose text contains ``image =``. The object literal assigned there is
     decoded and its ``domain``, ``base_dir``, ``dir`` and ``img`` entries
     are joined into a URL. When no such script is present, the ``src`` of
     the first ``<img>`` tag that carries an ``alt`` attribute is returned.

     Args:
         html_content: HTML markup to parse.

     Returns:
         The image URL string, when one of the two lookups succeeds.

     Raises:
         Exception: if the script's object literal cannot be decoded as
             JSON, either as written or after single quotes are converted
             to double quotes.
         KeyError: if the decoded object lacks one of the four keys above.
     """
     soup = BeautifulSoup(html_content, 'html.parser')
     # First, try to extract the image URL from the <script> tag
     script = soup.find('script', type='text/javascript', string=lambda text: 'image =' in text if text else False)
     if script:
         # Split on the same 'image =' marker the script was selected by, so
         # an earlier assignment in the script cannot shift the payload.
         js_object_str = script.string.split('image =', 1)[1].strip().rstrip(';')
         # Log the string for debugging
         print("Extracted JavaScript object string:", js_object_str)
         try:
             # Try the literal as written first: if it is already valid JSON,
             # rewriting quotes would corrupt apostrophes inside its values.
             image_data = json.loads(js_object_str)
         except json.JSONDecodeError:
             try:
                 # JavaScript-style single quotes: convert them and retry.
                 image_data = json.loads(js_object_str.replace("'", '"'))
             except json.JSONDecodeError as e:
                 raise Exception(f"Failed to decode JSON: {str(e)}") from e
         # Construct the full image URL
         return f"{image_data['domain']}{image_data['base_dir']}/{image_data['dir']}/{image_data['img']}"
     # No matching script tag was found: fall back to an <img alt> tag
     img_tag = soup.find('img', alt=True)
     if img_tag and 'src' in img_tag.attrs:
         return img_tag['src']
     # NOTE(review): this text comes from a diff hunk; any statements that
     # follow here in app.py are not visible, so the no-match result is
     # not documented.
 def extract_tags(html_content):
     """Return the general tags found in *html_content* as one string.

     Every ``<li>`` with class ``tag-type-general`` contributes the text of
     its second ``<a>`` element. Items with fewer than two anchors are
     skipped. The collected names are joined with commas; an empty string
     is returned when nothing matches.
     """
     soup = BeautifulSoup(html_content, 'html.parser')
     # One list of anchors per matching list item, evaluated lazily.
     anchors_per_item = (
         item.find_all('a')
         for item in soup.find_all('li', class_='tag-type-general')
     )
     # The tag name is the text of the second anchor in each item.
     return ','.join(
         anchors[1].text
         for anchors in anchors_per_item
         if len(anchors) > 1
     )
 def download_image(url, cookies=None):