Joshua Lochner

Add youtube transcript api

a45bd3f over 2 years ago

5.13 kB

	import argparse

	from ._api import YouTubeTranscriptApi

	from .formatters import FormatterLoader


	class YouTubeTranscriptCli(object):
	def __init__(self, args):
	self._args = args

	def run(self):
	parsed_args = self._parse_args()

	if parsed_args.exclude_manually_created and parsed_args.exclude_generated:
	return ''

	proxies = None
	if parsed_args.http_proxy != '' or parsed_args.https_proxy != '':
	proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy}

	cookies = parsed_args.cookies

	transcripts = []
	exceptions = []

	for video_id in parsed_args.video_ids:
	try:
	transcripts.append(self._fetch_transcript(parsed_args, proxies, cookies, video_id))
	except Exception as exception:
	exceptions.append(exception)

	return '\n\n'.join(
	[str(exception) for exception in exceptions]
	+ ([FormatterLoader().load(parsed_args.format).format_transcripts(transcripts)] if transcripts else [])
	)

	def _fetch_transcript(self, parsed_args, proxies, cookies, video_id):
	transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies, cookies=cookies)

	if parsed_args.list_transcripts:
	return str(transcript_list)

	if parsed_args.exclude_manually_created:
	transcript = transcript_list.find_generated_transcript(parsed_args.languages)
	elif parsed_args.exclude_generated:
	transcript = transcript_list.find_manually_created_transcript(parsed_args.languages)
	else:
	transcript = transcript_list.find_transcript(parsed_args.languages)

	if parsed_args.translate:
	transcript = transcript.translate(parsed_args.translate)

	return transcript.fetch()

	def _parse_args(self):
	parser = argparse.ArgumentParser(
	description=(
	'This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. '
	'It also works for automatically generated subtitles and it does not require a headless browser, like '
	'other selenium based solutions do!'
	)
	)
	parser.add_argument(
	'--list-transcripts',
	action='store_const',
	const=True,
	default=False,
	help='This will list the languages in which the given videos are available in.',
	)
	parser.add_argument('video_ids', nargs='+', type=str, help='List of YouTube video IDs.')
	parser.add_argument(
	'--languages',
	nargs='*',
	default=['en',],
	type=str,
	help=(
	'A list of language codes in a descending priority. For example, if this is set to "de en" it will '
	'first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails '
	'to do so. As I can\'t provide a complete list of all working language codes with full certainty, you '
	'may have to play around with the language codes a bit, to find the one which is working for you!'
	),
	)
	parser.add_argument(
	'--exclude-generated',
	action='store_const',
	const=True,
	default=False,
	help='If this flag is set transcripts which have been generated by YouTube will not be retrieved.',
	)
	parser.add_argument(
	'--exclude-manually-created',
	action='store_const',
	const=True,
	default=False,
	help='If this flag is set transcripts which have been manually created will not be retrieved.',
	)
	parser.add_argument(
	'--format',
	type=str,
	default='pretty',
	choices=tuple(FormatterLoader.TYPES.keys()),
	)
	parser.add_argument(
	'--translate',
	default='',
	help=(
	'The language code for the language you want this transcript to be translated to. Use the '
	'--list-transcripts feature to find out which languages are translatable and which translation '
	'languages are available.'
	)
	)
	parser.add_argument(
	'--http-proxy',
	default='',
	metavar='URL',
	help='Use the specified HTTP proxy.'
	)
	parser.add_argument(
	'--https-proxy',
	default='',
	metavar='URL',
	help='Use the specified HTTPS proxy.'
	)
	parser.add_argument(
	'--cookies',
	default=None,
	help='The cookie file that will be used for authorization with youtube.'
	)

	return self._sanitize_video_ids(parser.parse_args(self._args))

	def _sanitize_video_ids(self, args):
	args.video_ids = [video_id.replace('\\', '') for video_id in args.video_ids]
	return args