throaway2854 committed on
Commit
ade988f
·
verified ·
1 Parent(s): 1f07e5c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -5
app.py CHANGED
@@ -41,12 +41,31 @@ def make_request(url, cookies=None):
41
 
42
  def extract_image_url(html_content):
43
  soup = BeautifulSoup(html_content, 'html.parser')
 
 
44
  script = soup.find('script', type='text/javascript', string=lambda text: 'image =' in text if text else False)
45
 
46
  if script:
47
- image_data = json.loads(script.string.split('=', 1)[1].strip().rstrip(';'))
48
- return f"{image_data['domain']}{image_data['base_dir']}/{image_data['dir']}/{image_data['img']}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
 
50
  img_tag = soup.find('img', alt=True)
51
  if img_tag and 'src' in img_tag.attrs:
52
  return img_tag['src']
@@ -55,14 +74,19 @@ def extract_image_url(html_content):
55
 
56
  def extract_tags(html_content):
57
  soup = BeautifulSoup(html_content, 'html.parser')
 
 
58
  tag_elements = soup.find_all('li', class_='tag-type-general')
59
 
60
  tags = []
61
  for tag_element in tag_elements:
62
- tag_link = tag_element.find_all('a')[1]
63
- if tag_link:
64
- tags.append(tag_link.text)
 
 
65
 
 
66
  return ','.join(tags)
67
 
68
  def download_image(url, cookies=None):
 
41
 
42
  def extract_image_url(html_content):
43
  soup = BeautifulSoup(html_content, 'html.parser')
44
+
45
+ # First, try to extract the image URL from the <script> tag
46
  script = soup.find('script', type='text/javascript', string=lambda text: 'image =' in text if text else False)
47
 
48
  if script:
49
+ try:
50
+ # Extract and clean the JavaScript object string
51
+ js_object_str = script.string.split('=', 1)[1].strip().rstrip(';')
52
+
53
+ # Log the string for debugging
54
+ print("Extracted JavaScript object string:", js_object_str)
55
+
56
+ # Replace single quotes with double quotes to make it a valid JSON string
57
+ js_object_str = js_object_str.replace("'", '"')
58
+
59
+ # Parse the JSON object
60
+ image_data = json.loads(js_object_str)
61
+
62
+ # Construct the full image URL
63
+ return f"{image_data['domain']}{image_data['base_dir']}/{image_data['dir']}/{image_data['img']}"
64
+
65
+ except json.JSONDecodeError as e:
66
+ raise Exception(f"Failed to decode JSON: {str(e)}")
67
 
68
+ # If no matching <script> tag is found, fall back to the first <img> tag that has an alt attribute
69
  img_tag = soup.find('img', alt=True)
70
  if img_tag and 'src' in img_tag.attrs:
71
  return img_tag['src']
 
74
 
75
  def extract_tags(html_content):
76
  soup = BeautifulSoup(html_content, 'html.parser')
77
+
78
+ # Find all list items with the relevant class
79
  tag_elements = soup.find_all('li', class_='tag-type-general')
80
 
81
  tags = []
82
  for tag_element in tag_elements:
83
+ # The second <a> tag contains the relevant tag name
84
+ tag_links = tag_element.find_all('a')
85
+ if len(tag_links) > 1:
86
+ tag_name = tag_links[1].text
87
+ tags.append(tag_name)
88
 
89
+ # Join all tags into a single string separated by commas
90
  return ','.join(tags)
91
 
92
  def download_image(url, cookies=None):