"""Scrape song lyrics from https://www.disneyclips.com/lyrics/ and save them,
grouped by movie, to data/lyrics.json."""

import asyncio
import json
from collections import defaultdict
from itertools import chain
from typing import List, Tuple

import aiohttp
from bs4 import BeautifulSoup

URL = "https://www.disneyclips.com/lyrics/"
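
# Scraping happens in three stages:
#   1. get_movie_names_and_urls() lists every movie on the index page,
#   2. get_lyrics_names_and_urls_from_movie_url() lists each movie's songs,
#   3. get_lyric_from_lyric_url() extracts the lyric text of one song page;
# scrape_disney_lyrics() runs the requests concurrently with asyncio.gather.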


async def get_lyrics_names_and_urls_from_movie_url(
    movie_name: str, url: str, session: aiohttp.ClientSession
) -> List[Tuple[str, str, str]]:
    """Return (movie_name, song_name, song_url) for every song of one movie."""
    async with session.get(url) as response:
        html = await response.text()
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find("table", {"class": "songs"})
        names_and_urls = []
        if table:
            links = table.find_all("a")
            for link in links:
                # URL already ends with "/", so the relative href is appended directly.
                names_and_urls.append(
                    (movie_name, link.text, f"{URL}{link.get('href')}")
                )
        return names_and_urls


async def get_lyric_from_lyric_url(
    movie_name: str, lyric_name: str, url: str, session: aiohttp.ClientSession
) -> Tuple[str, str, str]:
    """Download one song page and return (movie_name, lyric_name, lyric_text)."""
    async with session.get(url) as response:
        html = await response.text()
        soup = BeautifulSoup(html, "html.parser")
        div = soup.find("div", {"id": "cnt"}).find("div", {"class": "main"})
        paragraphs = div.find_all("p")
        # The first <p> holds the lyric: turn <br> into ". " separators and
        # drop <span> elements before extracting the text.
        p = paragraphs[0]
        for br in p.find_all("br"):
            br.replace_with(". ")
        for span in p.find_all("span"):
            span.decompose()
        return (movie_name, lyric_name, p.text)


async def get_movie_names_and_urls(
    session: aiohttp.ClientSession,
) -> List[Tuple[str, str]]:
    """Return (movie_name, movie_url) for every movie on the lyrics index page."""
    async with session.get(URL) as response:
        html = await response.text()
        soup = BeautifulSoup(html, "html.parser")
        links = (
            soup.find("div", {"id": "cnt"}).find("div", {"class": "main"}).find_all("a")
        )
        movie_names_and_urls = [
            (link.text, f"{URL}{link.get('href')}") for link in links
        ]
        return movie_names_and_urls


async def scrape_disney_lyrics():
    """Scrape every movie and every song, then dump the lyrics to data/lyrics.json."""
    async with aiohttp.ClientSession() as session:
        movies = await get_movie_names_and_urls(session)
        # One request per movie, run concurrently.
        songs = await asyncio.gather(
            *[
                asyncio.create_task(
                    get_lyrics_names_and_urls_from_movie_url(*el, session)
                )
                for el in movies
            ]
        )
        # One request per song, run concurrently over the flattened song list.
        lyrics = await asyncio.gather(
            *[
                asyncio.create_task(get_lyric_from_lyric_url(*el, session))
                for el in chain(*songs)
            ]
        )
        result = defaultdict(list)
        for movie_name, lyric_name, lyric_text in lyrics:
            result[movie_name].append({"name": lyric_name, "text": lyric_text})
        # The "data/" directory must already exist.
        with open("data/lyrics.json", "w") as f:
            json.dump(result, f)
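

# Convenience helper, added for illustration and not part of the original script:
# reads the JSON written by scrape_disney_lyrics() back into memory.
def load_scraped_lyrics(path: str = "data/lyrics.json") -> dict:
    """Return a dict mapping each movie name to its list of {"name", "text"} entries."""
    with open(path) as f:
        return json.load(f)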


if __name__ == "__main__":
    asyncio.run(scrape_disney_lyrics())