Joshua Lochner committed on
Commit
62ea1e5
1 Parent(s): 4d4de75

Add language preference list

Browse files
Files changed (1) hide show
  1. src/preprocess.py +9 -2
src/preprocess.py CHANGED
@@ -30,6 +30,12 @@ PROFANITY_CONVERTED = '*****' # Safer version for tokenizing
30
 
31
  NUM_DECIMALS = 3
32
 
 
 
 
 
 
 
33
 
34
  def parse_transcript_json(json_data, granularity):
35
  assert json_data['wireMagic'] == 'pb3'
@@ -203,9 +209,10 @@ def get_words(video_id, process=True, transcript_type='auto', fallback='manual',
203
  if transcript_list is not None:
204
  if transcript_type == 'manual':
205
  ts = transcript_list.find_manually_created_transcript(
206
- ['en-GB', 'en-US', 'en'])
207
  else:
208
- ts = transcript_list.find_generated_transcript(['en'])
 
209
 
210
  raw_transcript_json = ts._http_client.get(
211
  f'{ts._url}&fmt=json3').json()
 
30
 
31
  NUM_DECIMALS = 3
32
 
33
+ # https://www.fincher.org/Utilities/CountryLanguageList.shtml
34
+ # https://lingohub.com/developers/supported-locales/language-designators-with-regions
35
+ LANGUAGE_PREFERENCE_LIST = ['en-GB', 'en-US', 'en-CA', 'en-AU', 'en-NZ', 'en-ZA',
36
+ 'en-IE', 'en-IN', 'en-JM', 'en-BZ', 'en-TT', 'en-PH', 'en-ZW',
37
+ 'en']
38
+
39
 
40
  def parse_transcript_json(json_data, granularity):
41
  assert json_data['wireMagic'] == 'pb3'
 
209
  if transcript_list is not None:
210
  if transcript_type == 'manual':
211
  ts = transcript_list.find_manually_created_transcript(
212
+ LANGUAGE_PREFERENCE_LIST)
213
  else:
214
+ ts = transcript_list.find_generated_transcript(
215
+ LANGUAGE_PREFERENCE_LIST)
216
 
217
  raw_transcript_json = ts._http_client.get(
218
  f'{ts._url}&fmt=json3').json()