# def get_lyrics_url_from_website():
# # https://www.disneyclips.com/lyrics/
import asyncio
import json
from collections import defaultdict
from itertools import chain
from typing import List, Tuple

import aiohttp
from bs4 import BeautifulSoup

URL = "https://www.disneyclips.com/lyrics/"

async def get_lyrics_names_and_urls_from_movie_url(
    movie_name: str, url: str, session: aiohttp.ClientSession
) -> List[Tuple[str, str, str]]:
    """Collect (movie name, song name, lyric URL) triples from a movie page."""
    async with session.get(url) as response:
        html = await response.text()
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find("table", {"class": "songs"})
        names_and_urls = []
        if table:
            links = table.find_all("a")
            for link in links:
                # hrefs are relative, so join them onto the base URL
                names_and_urls.append(
                    (movie_name, link.text, f"{URL}{link.get('href')}")
                )
        return names_and_urls

async def get_lyric_from_lyric_url(
    movie_name: str, lyric_name: str, url: str, session: aiohttp.ClientSession
) -> Tuple[str, str, str]:
    """Fetch a lyric page and return (movie name, song name, lyric text)."""
    async with session.get(url) as response:
        html = await response.text()
        soup = BeautifulSoup(html, "html.parser")
        div = soup.find("div", {"id": "cnt"}).find("div", {"class": "main"})
        paragraphs = div.find_all("p")
        # the first <p> holds the lyric
        p = paragraphs[0]
        # turn line breaks into sentence separators and drop inline annotations
        for br in p.find_all("br"):
            br.replace_with(". ")
        for span in p.find_all("span"):
            span.decompose()
        text = p.text
        return (movie_name, lyric_name, text)

async def get_movie_names_and_urls(
    session: aiohttp.ClientSession,
) -> List[Tuple[str, str]]:
    """Collect (movie name, movie URL) pairs from the lyrics index page."""
    async with session.get(URL) as response:
        html = await response.text()
        soup = BeautifulSoup(html, "html.parser")
        links = (
            soup.find("div", {"id": "cnt"}).find("div", {"class": "main"}).find_all("a")
        )
        movie_names_and_urls = [
            (link.text, f"{URL}{link.get('href')}") for link in links
        ]
        return movie_names_and_urls

async def scrape_disney_lyrics():
    async with aiohttp.ClientSession() as session:
        # 1) index page -> (movie name, movie URL)
        movies = await get_movie_names_and_urls(session)
        # 2) each movie page -> (movie name, song name, lyric URL), fetched concurrently
        songs = await asyncio.gather(
            *[
                asyncio.create_task(
                    get_lyrics_names_and_urls_from_movie_url(*movie, session)
                )
                for movie in movies
            ]
        )
        # 3) each lyric page -> (movie name, song name, lyric text), fetched concurrently
        lyrics = await asyncio.gather(
            *[
                asyncio.create_task(get_lyric_from_lyric_url(*song, session))
                for song in chain(*songs)
            ]
        )
        # group songs by movie and write the result to JSON
        result = defaultdict(list)
        for movie_name, lyric_name, lyric_text in lyrics:
            result[movie_name].append({"name": lyric_name, "text": lyric_text})
        # assumes the data/ directory already exists
        with open("data/lyrics.json", "w") as f:
            json.dump(result, f)

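# Illustrative only: a rough sketch of the JSON shape this writes to
# data/lyrics.json, keyed by movie name (the movie/song names below are
# made-up placeholders, not scraped output):
#
# {
#   "Some Movie": [
#     {"name": "Some Song", "text": "First line. Second line. ..."},
#     ...
#   ]
# }
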
if __name__ == "__main__":
    asyncio.run(scrape_disney_lyrics())