# FairytaleDJ / scrape.py
# Header residue from the hosting web page, kept as comments so the file parses:
# Francesco's picture | first version with emotions | e997328 | raw | history blame | 3.03 kB
# def get_lyrics_url_from_website():
# # https://www.disneyclips.com/lyrics/
import asyncio
import json
from collections import defaultdict
from itertools import chain
from pathlib import Path
from typing import List, Optional, Tuple, TypedDict
from urllib.parse import urljoin

import aiohttp
from bs4 import BeautifulSoup
# Index page of the lyrics section. It is downloaded to discover the movies
# and is also the address that scraped link targets are appended to.
# Note that it already ends with a slash.
URL = "https://www.disneyclips.com/lyrics/"
async def get_lyrics_names_and_urls_from_movie_url(
    movie_name: str, url: str, session: aiohttp.ClientSession
) -> List[Tuple[str, str, str]]:
    """Collect the song links listed on one movie's lyrics page.

    Args:
        movie_name: Movie title; copied unchanged into every returned tuple.
        url: Address of the movie page to download.
        session: Open aiohttp session used to perform the request.

    Returns:
        One ``(movie_name, song_name, song_url)`` tuple per link found inside
        the ``<table class="songs">`` element, or an empty list when the page
        contains no such table.
    """
    async with session.get(url) as response:
        html = await response.text()

    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", {"class": "songs"})
    if table is None:
        return []

    # href=True skips anchors that have no target, which would otherwise
    # produce a bogus ".../None" address. urljoin resolves the target against
    # URL without doubling the slash that URL already ends with.
    return [
        (movie_name, link.text, urljoin(URL, link["href"]))
        for link in table.find_all("a", href=True)
    ]
async def get_lyric_from_lyric_url(
    movie_name: str, lyric_name: str, url: str, session: aiohttp.ClientSession
) -> Tuple[str, str, str]:
    """Download one song page and extract its lyric text.

    Args:
        movie_name: Movie title; passed through to the result.
        lyric_name: Song title; passed through to the result.
        url: Address of the song page to download.
        session: Open aiohttp session used to perform the request.

    Returns:
        ``(movie_name, lyric_name, text)``. ``text`` is taken from the first
        ``<p>`` inside ``<div id="cnt">`` / ``<div class="main">``, with every
        ``<br>`` replaced by ``". "`` and every ``<span>`` removed. It is the
        empty string when the page does not have that structure.
    """
    async with session.get(url) as response:
        html = await response.text()

    soup = BeautifulSoup(html, "html.parser")
    content = soup.find("div", {"id": "cnt"})
    main = content.find("div", {"class": "main"}) if content is not None else None
    # The first <p> of the main container holds the lyric.
    lyric = main.find("p") if main is not None else None
    if lyric is None:
        # A page with an unexpected layout should not abort the whole
        # gathered scrape; report it as a song without text instead.
        return (movie_name, lyric_name, "")

    # Line breaks become sentence separators so the lyric reads as one string.
    for br in lyric.find_all("br"):
        br.replace_with(". ")
    # <span> elements are dropped together with their content.
    for span in lyric.find_all("span"):
        span.decompose()

    return (movie_name, lyric_name, lyric.text)
async def get_movie_names_and_urls(
    session: aiohttp.ClientSession,
) -> List[Tuple[str, str]]:
    """List the movies linked from the lyrics index page at ``URL``.

    Args:
        session: Open aiohttp session used to perform the request.

    Returns:
        One ``(link_text, page_url)`` tuple per anchor found inside
        ``<div id="cnt">`` / ``<div class="main">`` of the index page.

    Raises:
        AttributeError: If the index page does not contain that container
            structure (a lookup on the missing element fails).
    """
    async with session.get(URL) as response:
        html = await response.text()

    soup = BeautifulSoup(html, "html.parser")
    main = soup.find("div", {"id": "cnt"}).find("div", {"class": "main"})

    # href=True skips anchors without a target. urljoin resolves the target
    # against URL without doubling the slash that URL already ends with.
    return [
        (link.text, urljoin(URL, link["href"]))
        for link in main.find_all("a", href=True)
    ]
async def scrape_disney_lyrics():
    """Scrape every lyric reachable from the index and save them as JSON.

    The work happens in three stages: fetch the movie list, fetch each
    movie's song list concurrently, then fetch every song page concurrently.
    The result is written to ``data/lyrics.json`` as a mapping from movie
    name to a list of ``{"name": ..., "text": ...}`` objects.
    """
    async with aiohttp.ClientSession() as session:
        movies = await get_movie_names_and_urls(session)

        # gather() schedules the coroutines itself, so no explicit tasks are
        # needed. Results come back in the same order as the inputs.
        songs_per_movie = await asyncio.gather(
            *(
                get_lyrics_names_and_urls_from_movie_url(name, url, session)
                for name, url in movies
            )
        )
        lyrics = await asyncio.gather(
            *(
                get_lyric_from_lyric_url(*song, session)
                for song in chain.from_iterable(songs_per_movie)
            )
        )

    # Group the flat (movie, song, text) records by movie.
    result = defaultdict(list)
    for movie_name, lyric_name, lyric_text in lyrics:
        result[movie_name].append({"name": lyric_name, "text": lyric_text})

    output_path = Path("data/lyrics.json")
    # Create the target directory up front so a missing folder cannot discard
    # the results of the whole scrape at the very last step.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w") as f:
        json.dump(result, f)
# Run the scraper only when this file is executed as a script, so importing
# the module does not trigger network requests or write files.
if __name__ == "__main__":
    # asyncio.run creates a fresh event loop and closes it afterwards;
    # asyncio.get_event_loop() without a running loop is deprecated.
    asyncio.run(scrape_disney_lyrics())