Joshua Lochner

Add youtube transcript api

a45bd3f almost 3 years ago

13.1 kB

	import sys

	# Python 2 only: switch the interpreter-wide default string encoding to UTF-8.
	# The branch can only be exercised by running a different Python version, which is why it is excluded from
	# coverage.py measurement.
	if sys.version_info[0] == 2:  # pragma: no cover
	    # NOTE(review): reload(sys) presumably makes sys.setdefaultencoding available again - confirm on Python 2
	    reload(sys)
	    sys.setdefaultencoding('utf-8')

	import json

	from xml.etree import ElementTree

	import re

	from requests import HTTPError

	from ._html_unescaping import unescape
	from ._errors import (
	VideoUnavailable,
	TooManyRequests,
	YouTubeRequestFailed,
	NoTranscriptFound,
	TranscriptsDisabled,
	NotTranslatable,
	TranslationLanguageNotAvailable,
	NoTranscriptAvailable,
	FailedToCreateConsentCookie,
	)
	from ._settings import WATCH_URL


	def _raise_http_errors(response, video_id):
	    """
	    Hands back the given response if it reports success and converts an HTTP error into a library error otherwise.

	    :param response: the response to check; its `raise_for_status` method is called
	    :param video_id: the id of the video the request was made for, passed on to the raised error
	    :type video_id: str
	    :return: the very same response object that was passed in
	    :raises: YouTubeRequestFailed if `raise_for_status` raises an HTTPError
	    """
	    try:
	        response.raise_for_status()
	    except HTTPError as error:
	        raise YouTubeRequestFailed(error, video_id)
	    else:
	        return response


	class TranscriptListFetcher(object):
	    """
	    Downloads the watch page of a video and turns the captions data embedded in it into a TranscriptList.
	    """
	    def __init__(self, http_client):
	        """
	        :param http_client: http client used for every request; `get` and `cookies.set` are called on it
	        """
	        self._http_client = http_client

	    def fetch(self, video_id):
	        """
	        Builds the TranscriptList for the given video.

	        :param video_id: the id of the video to load the transcripts for
	        :type video_id: str
	        :return: the result of `TranscriptList.build` for the captions data found in the watch page
	        """
	        html = self._fetch_video_html(video_id)
	        captions_json = self._extract_captions_json(html, video_id)
	        return TranscriptList.build(self._http_client, video_id, captions_json)

	    def _extract_captions_json(self, html, video_id):
	        """
	        Cuts the JSON following the `"captions":` marker out of the page and returns its
	        `playerCaptionsTracklistRenderer` entry.

	        :param html: the (unescaped) HTML of the watch page
	        :type html: str
	        :param video_id: the id of the video, only used when raising errors
	        :type video_id: str
	        :return: the parsed `playerCaptionsTracklistRenderer` object, guaranteed to contain `captionTracks`
	        :rtype: dict
	        :raises: TooManyRequests, VideoUnavailable, TranscriptsDisabled, NoTranscriptAvailable
	        """
	        html_parts = html.split('"captions":')

	        if len(html_parts) > 1:
	            # the captions object ends where the `videoDetails` sibling key starts
	            raw_json = html_parts[1].split(',"videoDetails')[0].replace('\n', '')
	            renderer = json.loads(raw_json).get('playerCaptionsTracklistRenderer')
	            if renderer is None:
	                raise TranscriptsDisabled(video_id)
	            if 'captionTracks' not in renderer:
	                raise NoTranscriptAvailable(video_id)
	            return renderer

	        # no captions marker at all: use the rest of the page to decide which error to report
	        if 'class="g-recaptcha"' in html:
	            raise TooManyRequests(video_id)
	        if '"playabilityStatus":' not in html:
	            raise VideoUnavailable(video_id)
	        raise TranscriptsDisabled(video_id)

	    def _create_consent_cookie(self, html, video_id):
	        """
	        Sets the `CONSENT` cookie on the http client, using the value of the form field named `v` found in the page.

	        :param html: the HTML of the page containing the consent form
	        :type html: str
	        :param video_id: the id of the video, only used when raising errors
	        :type video_id: str
	        :raises: FailedToCreateConsentCookie if the page contains no `v` field
	        """
	        consent_field = re.search('name="v" value="(.*?)"', html)
	        if consent_field is None:
	            raise FailedToCreateConsentCookie(video_id)
	        cookie_value = 'YES+' + consent_field.group(1)
	        self._http_client.cookies.set('CONSENT', cookie_value, domain='.youtube.com')

	    def _fetch_video_html(self, video_id):
	        """
	        Loads the watch page. If the consent form is returned instead, the consent cookie is set and the page is
	        requested exactly one more time.

	        :param video_id: the id of the video to load the page for
	        :type video_id: str
	        :return: the HTML of the watch page
	        :rtype: str
	        :raises: FailedToCreateConsentCookie if the consent form is still returned after setting the cookie
	        """
	        consent_form_marker = 'action="https://consent.youtube.com/s"'

	        html = self._fetch_html(video_id)
	        if consent_form_marker not in html:
	            return html

	        self._create_consent_cookie(html, video_id)
	        html = self._fetch_html(video_id)
	        if consent_form_marker in html:
	            raise FailedToCreateConsentCookie(video_id)
	        return html

	    def _fetch_html(self, video_id):
	        """
	        Requests the watch URL of the video and returns the unescaped response text.

	        :param video_id: the id of the video to load the page for
	        :type video_id: str
	        :raises: YouTubeRequestFailed if the response signals an HTTP error
	        """
	        watch_url = WATCH_URL.format(video_id=video_id)
	        response = _raise_http_errors(self._http_client.get(watch_url), video_id)
	        return unescape(response.text)


	class TranscriptList(object):
	    """
	    This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
	    for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
	    """
	    def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
	        """
	        The constructor is only for internal use. Use the static build method instead.

	        :param video_id: the id of the video this TranscriptList is for
	        :type video_id: str
	        :param manually_created_transcripts: dict mapping language codes to the manually created transcripts
	        :type manually_created_transcripts: dict[str, Transcript]
	        :param generated_transcripts: dict mapping language codes to the generated transcripts
	        :type generated_transcripts: dict[str, Transcript]
	        :param translation_languages: list of languages which can be used for translatable languages
	        :type translation_languages: list[dict[str, str]]
	        """
	        self.video_id = video_id
	        self._manually_created_transcripts = manually_created_transcripts
	        self._generated_transcripts = generated_transcripts
	        self._translation_languages = translation_languages

	    @staticmethod
	    def build(http_client, video_id, captions_json):
	        """
	        Factory method for TranscriptList.

	        Caption tracks whose `kind` is `asr` are stored as generated transcripts, all others as manually created
	        ones. Within each group a later track replaces an earlier one with the same language code. If
	        `captions_json` has no `translationLanguages` entry, the list of translation languages is empty.

	        :param http_client: http client which is used to make the transcript retrieving http calls
	        :type http_client: requests.Session
	        :param video_id: the id of the video this TranscriptList is for
	        :type video_id: str
	        :param captions_json: the JSON parsed from the YouTube pages static HTML
	        :type captions_json: dict
	        :return: the created TranscriptList
	        :rtype: TranscriptList
	        """
	        translation_languages = [
	            {
	                'language': translation_language['languageName']['simpleText'],
	                'language_code': translation_language['languageCode'],
	            }
	            # the key may be absent; fall back to "no translation languages" instead of raising a KeyError
	            for translation_language in captions_json.get('translationLanguages', [])
	        ]

	        manually_created_transcripts = {}
	        generated_transcripts = {}

	        for caption in captions_json['captionTracks']:
	            is_generated = caption.get('kind', '') == 'asr'
	            transcript_dict = generated_transcripts if is_generated else manually_created_transcripts

	            transcript_dict[caption['languageCode']] = Transcript(
	                http_client,
	                video_id,
	                caption['baseUrl'],
	                caption['name']['simpleText'],
	                caption['languageCode'],
	                is_generated,
	                # a track which is not translatable gets no translation languages at all
	                translation_languages if caption.get('isTranslatable', False) else [],
	            )

	        return TranscriptList(
	            video_id,
	            manually_created_transcripts,
	            generated_transcripts,
	            translation_languages,
	        )

	    def __iter__(self):
	        """
	        Iterates over all transcripts, the manually created ones first, followed by the generated ones.
	        """
	        return iter(list(self._manually_created_transcripts.values()) + list(self._generated_transcripts.values()))

	    def find_transcript(self, language_codes):
	        """
	        Finds a transcript for a given language code. For each language code a manually created transcript is
	        preferred over a generated one. If you only want generated transcripts use `find_generated_transcript`
	        instead.

	        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
	        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
	        it fails to do so.
	        :type language_codes: list[str]
	        :return: the found Transcript
	        :rtype: Transcript
	        :raises: NoTranscriptFound
	        """
	        return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts])

	    def find_generated_transcript(self, language_codes):
	        """
	        Finds an automatically generated transcript for a given language code.

	        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
	        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
	        it fails to do so.
	        :type language_codes: list[str]
	        :return: the found Transcript
	        :rtype: Transcript
	        :raises: NoTranscriptFound
	        """
	        return self._find_transcript(language_codes, [self._generated_transcripts])

	    def find_manually_created_transcript(self, language_codes):
	        """
	        Finds a manually created transcript for a given language code.

	        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
	        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
	        it fails to do so.
	        :type language_codes: list[str]
	        :return: the found Transcript
	        :rtype: Transcript
	        :raises: NoTranscriptFound
	        """
	        return self._find_transcript(language_codes, [self._manually_created_transcripts])

	    def _find_transcript(self, language_codes, transcript_dicts):
	        """
	        Returns the first transcript found when trying every language code, in order, against every given dict,
	        in order. The language code is the outer loop, so the priority of the language codes wins over the order
	        of the dicts.

	        :param language_codes: language codes in a descending priority
	        :type language_codes: list[str]
	        :param transcript_dicts: the dicts (language code -> Transcript) to search in
	        :type transcript_dicts: list[dict[str, Transcript]]
	        :return: the found Transcript
	        :rtype: Transcript
	        :raises: NoTranscriptFound
	        """
	        for language_code in language_codes:
	            for transcript_dict in transcript_dicts:
	                if language_code in transcript_dict:
	                    return transcript_dict[language_code]

	        raise NoTranscriptFound(
	            self.video_id,
	            language_codes,
	            self
	        )

	    def __str__(self):
	        """
	        Lists the manually created transcripts, the generated transcripts and the translation languages, each
	        section showing `None` if it is empty.
	        """
	        return (
	            'For this video ({video_id}) transcripts are available in the following languages:\n\n'
	            '(MANUALLY CREATED)\n'
	            '{available_manually_created_transcript_languages}\n\n'
	            '(GENERATED)\n'
	            '{available_generated_transcripts}\n\n'
	            '(TRANSLATION LANGUAGES)\n'
	            '{available_translation_languages}'
	        ).format(
	            video_id=self.video_id,
	            available_manually_created_transcript_languages=self._get_language_description(
	                str(transcript) for transcript in self._manually_created_transcripts.values()
	            ),
	            available_generated_transcripts=self._get_language_description(
	                str(transcript) for transcript in self._generated_transcripts.values()
	            ),
	            available_translation_languages=self._get_language_description(
	                '{language_code} ("{language}")'.format(
	                    language=translation_language['language'],
	                    language_code=translation_language['language_code'],
	                ) for translation_language in self._translation_languages
	            )
	        )

	    def _get_language_description(self, transcript_strings):
	        """
	        Joins the given strings into a bullet list with one entry per line.

	        :param transcript_strings: the entries to list
	        :return: the bullet list, or the string 'None' if there are no entries
	        :rtype: str
	        """
	        description = '\n'.join(' - {transcript}'.format(transcript=transcript) for transcript in transcript_strings)
	        return description if description else 'None'


	class Transcript(object):
	    """
	    A single transcript of a video in one language. The transcript data itself is only loaded when `fetch` is
	    called.
	    """
	    def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
	        """
	        You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
	        TranscriptList.

	        :param http_client: http client which is used to make the transcript retrieving http calls
	        :type http_client: requests.Session
	        :param video_id: the id of the video this Transcript is for
	        :type video_id: str
	        :param url: the url which needs to be called to fetch the transcript
	        :param language: the name of the language this transcript uses
	        :param language_code: the code of the language this transcript uses
	        :param is_generated: whether this transcript is flagged as generated rather than manually created
	        :param translation_languages: the languages this transcript can be translated to, each entry being a dict
	        with the keys 'language' and 'language_code'; an empty list means the transcript is not translatable
	        """
	        self._http_client = http_client
	        self.video_id = video_id
	        self._url = url
	        self.language = language
	        self.language_code = language_code
	        self.is_generated = is_generated
	        self.translation_languages = translation_languages
	        # lookup table: language code -> language name
	        self._translation_languages_dict = dict(
	            (entry['language_code'], entry['language']) for entry in translation_languages
	        )

	    def fetch(self):
	        """
	        Loads the actual transcript data by requesting the transcript url and parsing the response text.

	        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
	        :rtype: list[dict]
	        :raises: YouTubeRequestFailed if the response signals an HTTP error
	        """
	        response = _raise_http_errors(self._http_client.get(self._url), self.video_id)
	        return _TranscriptParser().parse(response.text)

	    def __str__(self):
	        """
	        Formats the transcript as `<code> ("<language>")`, followed by `[TRANSLATABLE]` if it can be translated.
	        """
	        translatable_marker = '[TRANSLATABLE]' if self.is_translatable else ''
	        return '{language_code} ("{language}"){translation_description}'.format(
	            language_code=self.language_code,
	            language=self.language,
	            translation_description=translatable_marker,
	        )

	    @property
	    def is_translatable(self):
	        """
	        True if at least one translation language is available for this transcript.
	        """
	        return len(self.translation_languages) != 0

	    def translate(self, language_code):
	        """
	        Creates the Transcript for the translation of this transcript into the given language. The returned
	        Transcript is marked as generated and can not be translated any further.

	        :param language_code: the code of the language to translate to
	        :return: a new Transcript whose url is this transcript's url extended by the `tlang` parameter
	        :rtype: Transcript
	        :raises: NotTranslatable, TranslationLanguageNotAvailable
	        """
	        if not self.is_translatable:
	            raise NotTranslatable(self.video_id)

	        language_names = self._translation_languages_dict
	        if language_code not in language_names:
	            raise TranslationLanguageNotAvailable(self.video_id)

	        translated_url = '{url}&tlang={language_code}'.format(url=self._url, language_code=language_code)
	        return Transcript(
	            self._http_client,
	            self.video_id,
	            translated_url,
	            language_names[language_code],
	            language_code,
	            True,
	            [],
	        )


	class _TranscriptParser(object):
	    """
	    Converts the XML of a transcript into a list of plain dictionaries.
	    """
	    # matches anything which looks like a markup tag, so that it can be stripped from the transcript text
	    HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE)

	    def parse(self, plain_data):
	        """
	        Parses the given XML and creates one dictionary for every child of the root element which has text.
	        Children without any text are skipped.

	        :param plain_data: the XML document to parse
	        :type plain_data: str
	        :return: a list of dictionaries containing the 'text' (unescaped, tags removed), 'start' and 'duration'
	        keys; 'duration' is 0.0 if the element has no `dur` attribute
	        :rtype: list[dict]
	        """
	        # NOTE(review): plain_data comes from a remote server and the Python docs warn that ElementTree is not
	        # secure against maliciously constructed data - confirm this is acceptable here
	        snippets = []
	        for xml_element in ElementTree.fromstring(plain_data):
	            if xml_element.text is None:
	                continue
	            snippets.append({
	                'text': self.HTML_TAG_REGEX.sub('', unescape(xml_element.text)),
	                'start': float(xml_element.attrib['start']),
	                'duration': float(xml_element.attrib.get('dur', '0.0')),
	            })
	        return snippets