Gabriela Nicole Gonzalez Saez committed on
Commit
33fb482
1 Parent(s): 735ec79
.gitattributes CHANGED
@@ -34,3 +34,22 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  index/en-es_metadata_ref.pkl filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  index/en-es_metadata_ref.pkl filter=lfs diff=lfs merge=lfs -text
37
+ index/en-es_output_words.index filter=lfs diff=lfs merge=lfs -text
38
+ index/en-zh_metadata_ref.pkl filter=lfs diff=lfs merge=lfs -text
39
+ index/en-zh_output_words.index filter=lfs diff=lfs merge=lfs -text
40
+ index/en-ar_input_tokens.index filter=lfs diff=lfs merge=lfs -text
41
+ index/en-ar_output_tokens.index filter=lfs diff=lfs merge=lfs -text
42
+ index/en-fr_metadata_ref.pkl filter=lfs diff=lfs merge=lfs -text
43
+ index/en-fr_input_tokens.index filter=lfs diff=lfs merge=lfs -text
44
+ index/en-fr_input_words.index filter=lfs diff=lfs merge=lfs -text
45
+ index/en-fr_output_tokens.index filter=lfs diff=lfs merge=lfs -text
46
+ index/en-zh_input_tokens.index filter=lfs diff=lfs merge=lfs -text
47
+ index/en-ar_input_words.index filter=lfs diff=lfs merge=lfs -text
48
+ index/en-ar_metadata_ref.pkl filter=lfs diff=lfs merge=lfs -text
49
+ index/en-ar_output_words.index filter=lfs diff=lfs merge=lfs -text
50
+ index/en-fr_output_words.index filter=lfs diff=lfs merge=lfs -text
51
+ index/en-zh_input_words.index filter=lfs diff=lfs merge=lfs -text
52
+ index/en-zh_output_tokens.index filter=lfs diff=lfs merge=lfs -text
53
+ index/en-es_input_tokens.index filter=lfs diff=lfs merge=lfs -text
54
+ index/en-es_input_words.index filter=lfs diff=lfs merge=lfs -text
55
+ index/en-es_output_tokens.index filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ local_index/*
2
+ app_local.py
app.py CHANGED
@@ -23,23 +23,24 @@ from transformers import AutoTokenizer, MarianTokenizer, AutoModel, AutoModelFor
23
  model_es = "Helsinki-NLP/opus-mt-en-es"
24
  model_fr = "Helsinki-NLP/opus-mt-en-fr"
25
  model_zh = "Helsinki-NLP/opus-mt-en-zh"
26
- model_sw = "Helsinki-NLP/opus-mt-en-sw"
27
 
28
  tokenizer_es = AutoTokenizer.from_pretrained(model_es)
29
  tokenizer_fr = AutoTokenizer.from_pretrained(model_fr)
30
  tokenizer_zh = AutoTokenizer.from_pretrained(model_zh)
31
- tokenizer_sw = AutoTokenizer.from_pretrained(model_sw)
32
 
33
  model_tr_es = MarianMTModel.from_pretrained(model_es)
34
  model_tr_fr = MarianMTModel.from_pretrained(model_fr)
35
  model_tr_zh = MarianMTModel.from_pretrained(model_zh)
36
- model_tr_sw = MarianMTModel.from_pretrained(model_sw)
37
 
38
  from faiss import write_index, read_index
39
  import pickle
40
 
41
  def load_index(model):
42
- with open('index/'+ model + '_metadata_ref.pkl', 'rb') as f:
 
43
  loaded_dict = pickle.load(f)
44
  for type in ['tokens','words']:
45
  for kind in ['input', 'output']:
@@ -54,25 +55,28 @@ dict_models = {
54
  'en-es': model_es,
55
  'en-fr': model_fr,
56
  'en-zh': model_zh,
57
- 'en-sw': model_sw,
58
  }
59
 
60
  dict_models_tr = {
61
  'en-es': model_tr_es,
62
  'en-fr': model_tr_fr,
63
  'en-zh': model_tr_zh,
64
- 'en-sw': model_tr_sw,
65
  }
66
 
67
  dict_tokenizer_tr = {
68
  'en-es': tokenizer_es,
69
  'en-fr': tokenizer_fr,
70
  'en-zh': tokenizer_zh,
71
- 'en-sw': tokenizer_sw,
72
  }
73
-
74
  dict_reference_faiss = {
75
  'en-es': load_index('en-es'),
 
 
 
76
  }
77
 
78
  # print("dict", dict_reference_faiss['en-es']['input']['tokens'][1])
@@ -698,11 +702,11 @@ html_embd = """
698
  """
699
 
700
  html_tok_target ="""
701
- <div id="d3_tok_target">... tokenization visualization ...</div>
702
  """
703
 
704
  html_embd_target= """
705
- <div id="d3_embd_target">... token embeddings visualization ...</div>
706
  <div id="d3_graph_output_words" class="d3_graph words"></div>
707
  <div id="d3_graph_output_tokens" class="d3_graph tokens"></div>
708
  <div id="similar_output_words" class=""></div>
@@ -823,7 +827,7 @@ with gr.Blocks(js="plotsjs.js") as demo:
823
  """
824
  1. Select the language pair for the translation
825
  """)
826
- radio_c = gr.Radio(choices=['en-zh', 'en-es', 'en-fr', 'en-sw'], value="en-es", label= '', container=False)
827
  gr.Markdown(
828
  """
829
  2. Source text to translate
 
23
  model_es = "Helsinki-NLP/opus-mt-en-es"
24
  model_fr = "Helsinki-NLP/opus-mt-en-fr"
25
  model_zh = "Helsinki-NLP/opus-mt-en-zh"
26
+ model_ar = "Helsinki-NLP/opus-mt-en-ar"
27
 
28
  tokenizer_es = AutoTokenizer.from_pretrained(model_es)
29
  tokenizer_fr = AutoTokenizer.from_pretrained(model_fr)
30
  tokenizer_zh = AutoTokenizer.from_pretrained(model_zh)
31
+ tokenizer_ar = AutoTokenizer.from_pretrained(model_ar)
32
 
33
  model_tr_es = MarianMTModel.from_pretrained(model_es)
34
  model_tr_fr = MarianMTModel.from_pretrained(model_fr)
35
  model_tr_zh = MarianMTModel.from_pretrained(model_zh)
36
+ model_tr_ar = MarianMTModel.from_pretrained(model_ar)
37
 
38
  from faiss import write_index, read_index
39
  import pickle
40
 
41
  def load_index(model):
42
+ # with open('index/'+ model + '_metadata_ref.pkl', 'rb') as f:
43
+ with open('local_index/'+ model + '_metadata_ref.pkl', 'rb') as f:
44
  loaded_dict = pickle.load(f)
45
  for type in ['tokens','words']:
46
  for kind in ['input', 'output']:
 
55
  'en-es': model_es,
56
  'en-fr': model_fr,
57
  'en-zh': model_zh,
58
+ 'en-ar': model_ar,
59
  }
60
 
61
  dict_models_tr = {
62
  'en-es': model_tr_es,
63
  'en-fr': model_tr_fr,
64
  'en-zh': model_tr_zh,
65
+ 'en-ar': model_tr_ar,
66
  }
67
 
68
  dict_tokenizer_tr = {
69
  'en-es': tokenizer_es,
70
  'en-fr': tokenizer_fr,
71
  'en-zh': tokenizer_zh,
72
+ 'en-ar': tokenizer_ar,
73
  }
74
+ # dict_reference_faiss = {'en-es':[]}
75
  dict_reference_faiss = {
76
  'en-es': load_index('en-es'),
77
+ 'en-ar': load_index('en-ar'),
78
+ 'en-fr': load_index('en-fr'),
79
+ 'en-zh': load_index('en-zh'),
80
  }
81
 
82
  # print("dict", dict_reference_faiss['en-es']['input']['tokens'][1])
 
702
  """
703
 
704
  html_tok_target ="""
705
+ <div id="d3_tok_target"> </div>
706
  """
707
 
708
  html_embd_target= """
709
+ <div id="d3_embd_target"> </div>
710
  <div id="d3_graph_output_words" class="d3_graph words"></div>
711
  <div id="d3_graph_output_tokens" class="d3_graph tokens"></div>
712
  <div id="similar_output_words" class=""></div>
 
827
  """
828
  1. Select the language pair for the translation
829
  """)
830
+ radio_c = gr.Radio(choices=['en-zh', 'en-es', 'en-fr', 'en-ar'], value="en-es", label= '', container=False)
831
  gr.Markdown(
832
  """
833
  2. Source text to translate
index/en-ar_input_tokens.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:705af4d667addbc721da76b3099be10fcc437eeea0d30445751e1d7edbd2af19
3
+ size 754691
index/en-ar_input_words.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cac0ee3490002a39df24b27a6a7f570a60333d7f431c2f7aae2c4b1694d64948
3
+ size 756747
index/en-ar_metadata_ref.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37e6a98d587055a03daeceac1959cedd5ad793cf0fec6f1b377b1dcb5caa2c5f
3
+ size 28805589
index/en-ar_output_tokens.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd71fb94d70342945b2abdb658d6b04401c8cb2f98238d40919baecbd8f61781
3
+ size 1019915
index/en-ar_output_words.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:798941013c236a3be852512ddc8ff280969fee61f47336244e03da94d2eb8d52
3
+ size 892443
index/en-es_input_tokens.index CHANGED
Binary files a/index/en-es_input_tokens.index and b/index/en-es_input_tokens.index differ
 
index/en-es_input_words.index CHANGED
Binary files a/index/en-es_input_words.index and b/index/en-es_input_words.index differ
 
index/en-es_metadata_ref.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:545ee0536a47df856bb92082ef49ff5f3ac15d80c04b03c03fe8ebd8a089a356
3
- size 2042890
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f766d5a212ded9981ff504628d9ecadb567872e5244297582a3bc4ad0e3b774
3
+ size 25129757
index/en-es_output_tokens.index CHANGED
Binary files a/index/en-es_output_tokens.index and b/index/en-es_output_tokens.index differ
 
index/en-es_output_words.index CHANGED
Binary files a/index/en-es_output_words.index and b/index/en-es_output_words.index differ
 
index/en-fr_input_tokens.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e253777db5a5cc1eb76da856178d841f1a2814e11c7e7d4ac24be817e4983638
3
+ size 764971
index/en-fr_input_words.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:961b08022a7cea12c1dfbb8db0a2812e833980b0fcbd8cdf0455ee70f5ae778d
3
+ size 756747
index/en-fr_metadata_ref.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab1301d87b26b6af5d2cd54ad891ae53b817bfda08a0315f60964ae0bd57e6c5
3
+ size 27041562
index/en-fr_output_tokens.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b205bacecb6e4c8791e7c9820de772501ed974a19d365608646a69fe5cde3ff
3
+ size 910947
index/en-fr_output_words.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c90bf7ac638494a23efed842d3128f113f6ed279413b3362747892d7e9913e77
3
+ size 873939
index/en-zh_input_tokens.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:098a6ed7021dd99b7dbeaf6174373081c73fe9e16375b9de79e56275997c035d
3
+ size 754691
index/en-zh_input_words.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:881ef345b1d0c689be2c98539beeacc0516ff35c571ac2f2eb75a8bb67fb5330
3
+ size 756747
index/en-zh_metadata_ref.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:223617175b67c586d07e10577e3e7168875eec66f7a660d1a3ada40bc05f864c
3
+ size 22426143
index/en-zh_output_tokens.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05a9caebc080364fec8685a5dd4f90bdc6f89a6dd2e39d8e649a0f3969e02639
3
+ size 880107
index/en-zh_output_words.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1293142f7b1639bd7ec7c618d12ca100ddb6ef91d5e20920fa62dacc8f5ee20d
3
+ size 242747