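"""Crawl AI-generated images from openart.ai similarity search results.

For each query in main(), the script opens the results page in headless
Chrome, scrolls until infinite scroll is exhausted, collects image URLs,
and downloads them. Requires Chrome; Selenium 4.6+ resolves a matching
chromedriver automatically, while older setups need chromedriver on PATH.
"""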
import os
import time
import urllib.request

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from tqdm.auto import tqdm


def fetch_images(url):
    """Load `url` in headless Chrome, scroll until the page stops growing,
    and return the de-duplicated list of image URLs found."""
    options = Options()
    options.add_argument("--headless")
    options.add_argument("window-size=1400,1500")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("start-maximized")
    options.add_argument("enable-automation")
    options.add_argument("--disable-infobars")
    options.add_argument("--disable-dev-shm-usage")
    # Present a regular desktop browser user agent.
    user_agent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/91.0.4472.124 Safari/537.36")
    options.add_argument(f"user-agent={user_agent}")

    driver = webdriver.Chrome(options=options)
    try:
        driver.set_window_size(1400, 1500)
        driver.get(url)
        time.sleep(5)

        last_height = driver.execute_script(
            "return document.scrollingElement.scrollHeight")
        urls = []
        seen = set()
        while True:
            # Collect every <img> currently in the DOM. Lazy-loaded images
            # may carry their URL in data-src instead of src; skip elements
            # with neither, and de-duplicate across scroll passes.
            for img in driver.find_elements(By.TAG_NAME, "img"):
                src = img.get_attribute("src") or img.get_attribute("data-src")
                if src and src not in seen:
                    seen.add(src)
                    urls.append(src)
            # Scroll to the bottom to trigger the next batch of results.
            driver.execute_script(
                "document.scrollingElement.scrollTo(0, document.scrollingElement.scrollHeight);")
            time.sleep(5)
            new_height = driver.execute_script(
                "return document.scrollingElement.scrollHeight")
            print(f"last_height: {last_height}, new_height: {new_height}")
            # The page has stopped growing, so infinite scroll is exhausted.
            if new_height == last_height:
                break
            last_height = new_height
    finally:
        driver.quit()

    print(f"Total images: {len(urls)}")
    return urls


def download_images(img_urls, download_folder='downloaded_images', desc=""):
    """Download each URL in `img_urls` into `download_folder`, skipping
    files that already exist and URLs that cannot be fetched."""
    os.makedirs(download_folder, exist_ok=True)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/91.0.4472.124 Safari/537.36'}

    for img_url in tqdm(img_urls, desc=desc):
        # Skip data: URIs and anything else that is not an http(s) URL.
        if not img_url.startswith(("http://", "https://")):
            continue
        # Name the file after the last path segment, minus any query string.
        fname = img_url.split('/')[-1].split('?')[0]
        if not fname:
            continue
        filename = os.path.join(download_folder, fname)
        if os.path.exists(filename):
            continue
        try:
            req = urllib.request.Request(img_url, headers=headers)
            with urllib.request.urlopen(req) as response, \
                    open(filename, 'wb') as out_file:
                out_file.write(response.read())
        except OSError as e:  # URLError/HTTPError are OSError subclasses
            # One bad URL should not abort the whole batch.
            print(f"Failed to download {img_url}: {e}")


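# Hypothetical standalone use of download_images (placeholder URLs, shown
# only for illustration):
#
#   download_images(
#       ["https://example.com/a.jpg", "https://example.com/b.png"],
#       download_folder="sample_images",
#       desc="sample",
#   )

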
def crawl(query):
    """Fetch and download all images for one openart.ai similarity query."""
    url = f'https://openart.ai/search/{query}?method=similarity'
    download_path = "/truemedia-eval/crawled-fakes/images/fakes"
    try:
        img_urls = fetch_images(url)
        if len(img_urls) == 0:
            return
        download_images(img_urls, download_path, desc=f"query: {query}")
    except Exception as e:
        # Log and move on so one failing query does not stop the whole run.
        print(f"Failed to crawl '{query}': {e}")


def main():
    queries = ['xi', 'trump', 'biden', 'putin', 'zelensky', 'gaza', 'israel', 'taylor', 'palesti', 'nsfw', 'instagram',
               "Hamas", "Hezbollah", "Iran",
               "North Korea", "Kim Jong-un", "Uyghurs", "Hong Kong", "Taiwan", "Tibet", "Kashmir", "Syria", "Assad", "ISIS",
               "Taliban", "Afghanistan", "Saudi", "Yemen", "Qatar", "Brexit", "Catalonia", "Black lives matter", "Antifa", "Proud Boys",
               "Abortion", "Guns", "Second Amendment", "Police", "Execution", "Drugs", "Opioids", "Climate", "Paris",
               "Green", "Keystone", "Fracking", "GMOs", "Net", "WikiLeaks", "Assange", "Snowden", "NSA", "Patriot",
               "COVID", "Vaccines", "Masks", "5G", "QAnon", "Fraud", "Capitol", "Impeachment", "Hunter", "Epstein",
               "Trafficking", "MeToo", "Weinstein", "Cancel", "Race", "Affirmative", "Transgender", "Marriage",
               "Religion", "Nationalism", "Neo-Nazism", "Confederate", "Antisemitism", "Islamophobia", "Wall", "ICE",
               "DACA", "Dreamers", "Separation", "Sanctuary", "Refugees", "Travel", "Benghazi", "Emails", "Russia",
               "Mueller", "Ukraine", "Mar-a-Lago", "Taxes", "Emoluments", "Gerrymandering", "Voter", "Mail", "Citizens",
               "PACs", "Breonna", "Floyd", "Rittenhouse", "Insurrection", "Ivanka", "Kushner", "Bannon", "Flynn", "Stone"]

    for query in queries:
        crawl(query)


if __name__ == "__main__":
    main()
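# Assumed invocation (the script filename is hypothetical):
#   pip install selenium tqdm
#   python crawl_openart.py
# Note that download_path in crawl() is absolute and may need adjusting.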