# distildire/deploy_utils/crawl_civt.py
import os
import time
import urllib.request

from tqdm.auto import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
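# NOTE: assumes a Chrome/Chromium binary is installed; with Selenium 4.6+ a
# matching ChromeDriver is resolved automatically by Selenium Manager,
# otherwise the driver must be on PATH.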
# Fetch the page in headless Chrome, scroll to the bottom until no new
# content loads, and collect image URLs
def fetch_images(url):
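    """Return a list of image URLs scraped from `url` with a headless browser."""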
    # Headless Chrome flags; --no-sandbox and --disable-dev-shm-usage are
    # commonly needed when running inside containers.
    options = Options()
    options.add_argument("--headless")
    options.add_argument("window-size=1400,1500")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("start-maximized")
    options.add_argument("enable-automation")
    options.add_argument("--disable-infobars")
    options.add_argument("--disable-dev-shm-usage")
    # Present a desktop browser user agent so the site serves the normal page.
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    options.add_argument(f"user-agent={user_agent}")
    driver = webdriver.Chrome(options=options)
    driver.set_window_size(1400, 1500)
    driver.get(url)
    time.sleep(5)  # Give the page's JavaScript time to render the first batch.
    last_height = driver.execute_script("return document.scrollingElement.scrollHeight")
    urls = set()
    while True:
        # Collect every <img> currently in the DOM; lazy-loaded images may
        # expose their URL via data-src instead of src. The set de-duplicates
        # elements seen again on later scroll passes.
        for img in driver.find_elements(By.TAG_NAME, 'img'):
            src = img.get_attribute('src') or img.get_attribute('data-src')
            if src:
                urls.add(src)
        # Scroll to the bottom to trigger loading of the next batch of results.
        driver.execute_script("document.scrollingElement.scrollTo(0, document.scrollingElement.scrollHeight);")
        time.sleep(5)
        new_height = driver.execute_script("return document.scrollingElement.scrollHeight")
        print(f"last_height: {last_height}, new_height: {new_height}")
        if new_height == last_height:
            break  # Page height stopped growing: no more results to load.
        last_height = new_height
    driver.quit()
    print(f"Total images: {len(urls)}")
    return list(urls)
# Function to download images
def download_images(img_urls, download_folder='downloaded_images', desc=""):
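    """Download each URL in `img_urls` into `download_folder`, skipping existing files."""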
    os.makedirs(download_folder, exist_ok=True)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    for img_url in tqdm(img_urls, desc=desc):
        # Derive a filename from the last path segment, dropping any query string.
        fname = img_url.split('/')[-1].split('?')[0]
        if not fname:
            continue
        filename = os.path.join(download_folder, fname)
        if os.path.exists(filename):
            continue  # Already downloaded on a previous run.
        req = urllib.request.Request(img_url, headers=headers)
        with urllib.request.urlopen(req) as response, open(filename, 'wb') as out_file:
            out_file.write(response.read())
def crawl(query):
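    """Search openart.ai for `query` and download every result image."""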
    url = f'https://openart.ai/search/{query}?method=similarity'
    download_path = "/truemedia-eval/crawled-fakes/images/fakes"
    try:
        img_urls = fetch_images(url)
        if len(img_urls) == 0:
            return
        download_images(img_urls, download_path, desc=f"query: {query}")
    except Exception as e:
        # Skip this query rather than aborting the whole crawl.
        print(f"Failed to crawl {query!r}: {e}")
# Main function to coordinate the image crawling
def main():
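    """Run the crawler over the full list of search queries."""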
    queries = ['xi', 'trump', 'biden', 'putin', 'zelensky', 'gaza', 'israel', 'taylor', 'palesti', 'nsfw', 'instagram',
               "Hamas", "Hezbollah", "Iran",
               "North Korea", "Kim Jong-un", "Uyghurs", "Hong Kong", "Taiwan", "Tibet", "Kashmir", "Syria", "Assad", "ISIS",
               "Taliban", "Afghanistan", "Saudi", "Yemen", "Qatar", "Brexit", "Catalonia", "Black lives matter", "Antifa", "Proud Boys",
               "Abortion", "Guns", "Second Amendment", "Police", "Execution", "Drugs", "Opioids", "Climate", "Paris",
               "Green", "Keystone", "Fracking", "GMOs", "Net", "WikiLeaks", "Assange", "Snowden", "NSA", "Patriot",
               "COVID", "Vaccines", "Masks", "5G", "QAnon", "Fraud", "Capitol", "Impeachment", "Hunter", "Epstein",
               "Trafficking", "MeToo", "Weinstein", "Cancel", "Race", "Affirmative", "Transgender", "Marriage",
               "Religion", "Nationalism", "Neo-Nazism", "Confederate", "Antisemitism", "Islamophobia", "Wall", "ICE",
               "DACA", "Dreamers", "Separation", "Sanctuary", "Refugees", "Travel", "Benghazi", "Emails", "Russia",
               "Mueller", "Ukraine", "Mar-a-Lago", "Taxes", "Emoluments", "Gerrymandering", "Voter", "Mail", "Citizens",
               "PACs", "Breonna", "Floyd", "Rittenhouse", "Insurrection", "Ivanka", "Kushner", "Bannon", "Flynn", "Stone"
               ]
    for query in queries:
        crawl(query)

if __name__ == "__main__":
    main()