# lol_champion_pick_predictor/util/Leaderboard_scrapper.py
# Author: Jimin Park ("added model", commit abcb943)
import pandas as pd
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def scrape_leaderboards(regions=None, pages_per_region=5, output_file=None, delay=2):
    """
    Scrape leaderboard data from op.gg for specified regions and return as DataFrame.

    Args:
        regions (list): List of regions to scrape. Defaults to ["kr", "na", "vn", "euw"]
        pages_per_region (int): Number of pages to scrape per region. Defaults to 5
        output_file (str): Path to output CSV file. Defaults to
            "util/data/leaderboard_data.csv". Pass a falsy value (e.g. "") to skip saving.
        delay (int): Delay between page requests in seconds. Defaults to 2

    Returns:
        pandas.DataFrame: Scraped leaderboard data (possibly empty), or None when a
        fatal error occurs (e.g. the WebDriver could not be started).
    """
    # Set defaults
    if regions is None:
        regions = ["kr", "na", "vn", "euw"]
    if output_file is None:
        output_file = os.path.join("util", "data", "leaderboard_data.csv")

    leaderboard_data = []
    # Initialize to None so the finally block cannot raise NameError when
    # WebDriver construction fails before `driver` is ever assigned.
    driver = None
    try:
        # Headless Chrome setup; flags quiet the logging and make it CI-friendly.
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--disable-logging")
        chrome_options.add_argument("--log-level=3")
        chrome_options.add_argument("--disable-extensions")
        # 'eager' returns as soon as the DOM is ready, without waiting for
        # images/stylesheets; we wait for the table explicitly below anyway.
        chrome_options.page_load_strategy = 'eager'
        chrome_options.add_argument(
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        )
        # Initialize WebDriver
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options
        )

        for region in regions:
            print(f"\nScraping {region.upper()} region...")
            for page in range(1, pages_per_region + 1):
                print(f"Processing page {page}/{pages_per_region}")
                url = f"https://www.op.gg/leaderboards/tier?region={region}&type=ladder&page={page}"
                try:
                    driver.get(url)
                    # Wait for the ranking table to load. NOTE(review): this CSS
                    # class looks auto-generated by op.gg and may break when the
                    # site is redeployed — confirm selector periodically.
                    table = WebDriverWait(driver, 15).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "table.css-1l95r9q.e4dns9u11"))
                    )
                    rows = table.find_elements(By.TAG_NAME, "tr")[1:]  # Skip header row
                    for row in rows:
                        try:
                            cells = row.find_elements(By.TAG_NAME, "td")
                            record = _parse_leaderboard_row(cells, region)
                            if record is not None:
                                leaderboard_data.append(record)
                        except Exception as e:
                            print(f"Error processing row in {region} page {page}: {e}")
                except Exception as e:
                    print(f"Error processing {region} page {page}: {e}")
                # Throttle between page requests even after a failed page, so an
                # error burst does not hammer the site (the original `continue`
                # in the except skipped this sleep).
                time.sleep(delay)
    except Exception as e:
        print(f"Fatal error: {e}")
        return None
    finally:
        if driver is not None:
            driver.quit()

    # Create DataFrame
    df = pd.DataFrame(leaderboard_data)
    # Guard: with zero scraped rows the DataFrame has no columns and the
    # conversions below would raise KeyError('lp').
    if df.empty:
        print("No leaderboard rows were scraped.")
        return df

    # Clean and convert data types
    df['lp'] = df['lp'].str.replace(',', '').str.replace('LP', '').astype(float)
    df['level'] = df['level'].astype(int)
    df['win'] = pd.to_numeric(df['win'], errors='coerce')
    df['loss'] = pd.to_numeric(df['loss'], errors='coerce')
    df['winrate'] = df['winrate'].str.rstrip('%').astype(float) / 100

    # Save to CSV if output_file is specified
    if output_file:
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        df.to_csv(output_file, index=False)
        print(f"Leaderboard data saved to {output_file}")

    return df


def _parse_leaderboard_row(cells, region):
    """
    Build one leaderboard record from the <td> cells of a table row.

    Args:
        cells (list): <td> WebElements of a single table row.
        region (str): Region code the row was scraped from.

    Returns:
        dict: Record with summoner/rank/tier/lp/champion/level/win-loss fields,
        or None when the row does not have the expected number of columns.
    """
    if len(cells) < 7:
        return None

    # Extract basic data
    summoner = cells[1].text.strip().replace("\n", " ")
    rank = cells[0].text.strip()
    tier = cells[2].text.strip()
    lp = cells[3].text.strip()
    level = cells[5].text.strip()

    # Up to three most-played champion names come from <img alt="..."> tags;
    # pad with empty strings so all three columns are always present.
    champion_imgs = cells[4].find_elements(By.TAG_NAME, "img")
    champions = [img.get_attribute("alt") for img in champion_imgs]
    champion_data = champions + [""] * (3 - len(champions))

    # Win/loss cell text appears to render as "123W\n45L\n73%" — split on
    # newlines and strip the W/L suffixes; missing parts become "".
    winrate_text = cells[6].text.strip().split("\n")
    wins = winrate_text[0].rstrip("W") if len(winrate_text) > 0 else ""
    losses = winrate_text[1].rstrip("L") if len(winrate_text) > 1 else ""
    winrate = winrate_text[2] if len(winrate_text) > 2 else ""

    return {
        "summoner": summoner,
        "region": region,
        "rank": rank,
        "tier": tier,
        "lp": lp,
        "most_champion_1": champion_data[0],
        "most_champion_2": champion_data[1],
        "most_champion_3": champion_data[2],
        "level": level,
        "win": wins,
        "loss": losses,
        "winrate": winrate,
    }