entai2965 committed on
Commit
19b46b1
1 Parent(s): 02ea314

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +85 -0
README.md CHANGED
@@ -321,6 +321,91 @@ target = results[0].hypotheses[0][1:]
321
  print(tokenizer.decode(tokenizer.convert_tokens_to_ids(target)))
322
  ```
323
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
  ## Available languages
325
  - https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200
326
 
 
321
  print(tokenizer.decode(tokenizer.convert_tokens_to_ids(target)))
322
  ```
323
 
324
+ ## How to run this model (batch syntax)
325
+ ```python
326
+ import os
327
+ import ctranslate2
328
+ import transformers
329
+
330
+ #set defaults
331
+ home_path=os.path.expanduser('~')
332
+ #model_folder=home_path+'/Downloads/models/nllb-200-distilled-600M-ctranslate2' #3 GB of memory
333
+ model_folder=home_path+'/Downloads/models/nllb-200-distilled-1.3B-ctranslate2' #5.5 GB of memory
334
+ #model_folder=home_path+'/Downloads/models/nllb-200-3.3B-ctranslate2-float16' #13 GB of memory in almost all cases, 7.6 GB on CUDA + GeForce RTX 2000 series and newer
335
+ #model_folder=home_path+'/Downloads/models/nllb-200-3.3B-ctranslate2' #13 GB of memory
336
+
337
+ string1='Hello world!'
338
+ string2='Awesome.'
339
+ raw_list=[string1, string2]
340
+
341
+ #https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200
342
+ source_language_code = "eng_Latn"
343
+ target_language_code = "fra_Latn"
344
+
345
+ device='cpu'
346
+ #device='cuda'
347
+
348
+ #load models
349
+ translator = ctranslate2.Translator(model_folder,device=device)
350
+ tokenizer = transformers.AutoTokenizer.from_pretrained(model_folder, src_lang=source_language_code, clean_up_tokenization_spaces=True)
351
+
352
+ #tokenize input
353
+ encoded_list=[]
354
+ for text in raw_list:
355
+ encoded_list.append(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))
356
+
357
+ #translate
358
+ #https://opennmt.net/CTranslate2/python/ctranslate2.Translator.html?#ctranslate2.Translator.translate_batch
359
+ translated_list = translator.translate_batch(encoded_list, target_prefix=[[target_language_code]]*len(raw_list))
360
+ assert(len(raw_list)==len(translated_list))
361
+
362
+ #decode
363
+ for counter,tokens in enumerate(translated_list):
364
+ translated_list[counter]=tokenizer.decode(tokenizer.convert_tokens_to_ids(tokens.hypotheses[0][1:]))
365
+
366
+ #output
367
+ for text in translated_list:
368
+ print(text)
369
+ ```
370
+
371
+ [Functional programming](https://docs.python.org/3/howto/functional.html) version
372
+
373
+ ```python
374
+ import os
375
+ import ctranslate2
376
+ import transformers
377
+
378
+ #set defaults
379
+ home_path=os.path.expanduser('~')
380
+ #model_folder=home_path+'/Downloads/models/nllb-200-distilled-600M-ctranslate2' #3 GB of memory
381
+ model_folder=home_path+'/Downloads/models/nllb-200-distilled-1.3B-ctranslate2' #5.5 GB of memory
382
+ #model_folder=home_path+'/Downloads/models/nllb-200-3.3B-ctranslate2-float16' #13 GB of memory in almost all cases, 7.6 GB on CUDA + GeForce RTX 2000 series and newer
383
+ #model_folder=home_path+'/Downloads/models/nllb-200-3.3B-ctranslate2' #13 GB of memory
384
+
385
+ string1='Hello world!'
386
+ string2='Awesome.'
387
+ raw_list=[string1, string2]
388
+
389
+ #https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200
390
+ source_language_code = "eng_Latn"
391
+ target_language_code = "fra_Latn"
392
+
393
+ device='cpu'
394
+ #device='cuda'
395
+
396
+ #load models
397
+ translator = ctranslate2.Translator(model_folder,device=device)
398
+ tokenizer = transformers.AutoTokenizer.from_pretrained(model_folder, src_lang=source_language_code, clean_up_tokenization_spaces=True)
399
+
400
+ #invoke black magic
401
+ translated_list=[tokenizer.decode(tokenizer.convert_tokens_to_ids(tokens.hypotheses[0][1:])) for tokens in translator.translate_batch([tokenizer.convert_ids_to_tokens(tokenizer.encode(text)) for text in raw_list], target_prefix=[[target_language_code]]*len(raw_list))]
402
+ assert(len(raw_list)==len(translated_list))
403
+
404
+ #output
405
+ for text in translated_list:
406
+ print(text)
407
+ ```
408
+
409
  ## Available languages
410
  - https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200
411