Spaces:
Running
Running
Joshua Lochner
committed on
Commit
•
62ea1e5
1
Parent(s):
4d4de75
Add language preference list
Browse files
- src/preprocess.py +9 -2
src/preprocess.py
CHANGED
@@ -30,6 +30,12 @@ PROFANITY_CONVERTED = '*****' # Safer version for tokenizing
|
|
30 |
|
31 |
NUM_DECIMALS = 3
|
32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
def parse_transcript_json(json_data, granularity):
|
35 |
assert json_data['wireMagic'] == 'pb3'
|
@@ -203,9 +209,10 @@ def get_words(video_id, process=True, transcript_type='auto', fallback='manual',
|
|
203 |
if transcript_list is not None:
|
204 |
if transcript_type == 'manual':
|
205 |
ts = transcript_list.find_manually_created_transcript(
|
206 |
-
|
207 |
else:
|
208 |
-
ts = transcript_list.find_generated_transcript(
|
|
|
209 |
|
210 |
raw_transcript_json = ts._http_client.get(
|
211 |
f'{ts._url}&fmt=json3').json()
|
|
|
30 |
|
31 |
NUM_DECIMALS = 3
|
32 |
|
33 |
+
# https://www.fincher.org/Utilities/CountryLanguageList.shtml
|
34 |
+
# https://lingohub.com/developers/supported-locales/language-designators-with-regions
|
35 |
+
LANGUAGE_PREFERENCE_LIST = ['en-GB', 'en-US', 'en-CA', 'en-AU', 'en-NZ', 'en-ZA',
|
36 |
+
'en-IE', 'en-IN', 'en-JM', 'en-BZ', 'en-TT', 'en-PH', 'en-ZW',
|
37 |
+
'en']
|
38 |
+
|
39 |
|
40 |
def parse_transcript_json(json_data, granularity):
|
41 |
assert json_data['wireMagic'] == 'pb3'
|
|
|
209 |
if transcript_list is not None:
|
210 |
if transcript_type == 'manual':
|
211 |
ts = transcript_list.find_manually_created_transcript(
|
212 |
+
LANGUAGE_PREFERENCE_LIST)
|
213 |
else:
|
214 |
+
ts = transcript_list.find_generated_transcript(
|
215 |
+
LANGUAGE_PREFERENCE_LIST)
|
216 |
|
217 |
raw_transcript_json = ts._http_client.get(
|
218 |
f'{ts._url}&fmt=json3').json()
|