Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -41,12 +41,31 @@ def make_request(url, cookies=None):
|
|
41 |
|
42 |
def extract_image_url(html_content):
|
43 |
soup = BeautifulSoup(html_content, 'html.parser')
|
|
|
|
|
44 |
script = soup.find('script', type='text/javascript', string=lambda text: 'image =' in text if text else False)
|
45 |
|
46 |
if script:
|
47 |
-
|
48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
|
|
50 |
img_tag = soup.find('img', alt=True)
|
51 |
if img_tag and 'src' in img_tag.attrs:
|
52 |
return img_tag['src']
|
@@ -55,14 +74,19 @@ def extract_image_url(html_content):
|
|
55 |
|
56 |
def extract_tags(html_content):
|
57 |
soup = BeautifulSoup(html_content, 'html.parser')
|
|
|
|
|
58 |
tag_elements = soup.find_all('li', class_='tag-type-general')
|
59 |
|
60 |
tags = []
|
61 |
for tag_element in tag_elements:
|
62 |
-
|
63 |
-
|
64 |
-
|
|
|
|
|
65 |
|
|
|
66 |
return ','.join(tags)
|
67 |
|
68 |
def download_image(url, cookies=None):
|
|
|
41 |
|
42 |
def extract_image_url(html_content):
    """Return the image URL found in *html_content*.

    Two strategies are tried in order:

    1. A ``<script type="text/javascript">`` element whose text contains
       ``image =`` followed by a JavaScript object literal with the keys
       ``domain``, ``base_dir``, ``dir`` and ``img``. The URL is assembled
       from those four fields.
    2. If no such script element exists, the ``src`` attribute of the first
       ``<img>`` element that has an ``alt`` attribute.

    Raises:
        Exception: if the script's object literal cannot be parsed as JSON
            (chained from the underlying ``json.JSONDecodeError``).
        KeyError: if the parsed object lacks one of the four fields.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # First, try to extract the image URL from the <script> tag
    script = soup.find(
        'script',
        type='text/javascript',
        string=lambda text: 'image =' in text if text else False,
    )

    if script:
        try:
            # Split on the same 'image =' marker that selected this script,
            # not on the first '=' in the text: an earlier assignment in the
            # same script would otherwise end up inside the payload.
            js_object_str = script.string.split('image =', 1)[1].strip().rstrip(';')

            # Log the string for debugging
            print("Extracted JavaScript object string:", js_object_str)

            # Replace single quotes with double quotes to make it a valid
            # JSON string.
            # NOTE: this is a blunt conversion; a value that itself contains
            # an apostrophe will produce invalid JSON and end up in the
            # except branch below.
            js_object_str = js_object_str.replace("'", '"')

            # Parse the JSON object
            image_data = json.loads(js_object_str)
        except json.JSONDecodeError as e:
            # Keep the original exception type and message for callers, but
            # chain the cause so the traceback shows where parsing failed.
            raise Exception(f"Failed to decode JSON: {str(e)}") from e

        # Construct the full image URL
        return f"{image_data['domain']}{image_data['base_dir']}/{image_data['dir']}/{image_data['img']}"

    # Only reached when no matching <script> tag was found (a parse failure
    # above raises instead): try to get the image URL from an <img alt> tag.
    img_tag = soup.find('img', alt=True)
    if img_tag and 'src' in img_tag.attrs:
        return img_tag['src']
|
|
|
74 |
|
75 |
def extract_tags(html_content):
    """Return the general tags in *html_content* as a comma-separated string.

    Every ``<li class="tag-type-general">`` element is inspected; when it
    contains at least two ``<a>`` elements, the text of the second one is
    taken as the tag name. Items with fewer than two links are skipped.
    An empty string is returned when nothing matches.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')

    # One list of <a> elements per matching list item, in document order.
    anchors_per_item = (
        item.find_all('a')
        for item in parsed.find_all('li', class_='tag-type-general')
    )

    # The second link carries the tag name; join the names with commas.
    return ','.join(
        anchors[1].text
        for anchors in anchors_per_item
        if len(anchors) > 1
    )
|
91 |
|
92 |
def download_image(url, cookies=None):
|