HoneyTian committed on
Commit
94752cf
1 Parent(s): dd5fc6c
.gitignore CHANGED
@@ -6,6 +6,7 @@
6
  flagged/
7
  pretrained_models/
8
  temp/
 
9
 
10
  **/cache/
11
  **/__pycache__/
 
6
  flagged/
7
  pretrained_models/
8
  temp/
9
+ trained_models/
10
 
11
  **/cache/
12
  **/__pycache__/
main.py CHANGED
@@ -7,6 +7,8 @@ import argparse
7
  import json
8
  import platform
9
 
 
 
10
  import fasttext
11
  from fasttext.FastText import load_model, _FastText
12
  import gradio as gr
@@ -14,6 +16,7 @@ from gradio import inputs, outputs
14
  from langid.langid import LanguageIdentifier, model
15
 
16
  from project_settings import project_path, temp_directory
 
17
 
18
 
19
  def get_args():
@@ -39,11 +42,34 @@ def get_args():
39
 
40
  lang_id_identifier: LanguageIdentifier = None
41
  fasttext_model: _FastText = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
 
44
  def click_lang_id_button(text: str, ground_true: str, model_name: str):
45
  global lang_id_identifier
46
  global fasttext_model
 
47
 
48
  text = str(text).strip()
49
 
@@ -53,6 +79,16 @@ def click_lang_id_button(text: str, ground_true: str, model_name: str):
53
  label, prob = fasttext_model.predict(text, k=1)
54
  label = label[0][9:]
55
  prob = prob[0]
 
 
 
 
 
 
 
 
 
 
56
  else:
57
  label = "model_name not available."
58
  prob = -1
@@ -76,15 +112,17 @@ def main():
76
 
77
  global lang_id_identifier
78
  global fasttext_model
 
79
  lang_id_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
80
  fasttext_model = fasttext.load_model(args.fasttext_model)
 
81
 
82
  blocks = gr.Interface(
83
  click_lang_id_button,
84
  inputs=[
85
  inputs.Textbox(lines=3, label="text"),
86
  inputs.Textbox(label="ground_true"),
87
- inputs.Dropdown(choices=["langid", "fasttext"], default="langid", label="model_name"),
88
  ],
89
  outputs=[
90
  outputs.Textbox(label="label"),
 
7
  import json
8
  import platform
9
 
10
+ from allennlp.models.archival import archive_model, load_archive
11
+ from allennlp.predictors.text_classifier import TextClassifierPredictor
12
  import fasttext
13
  from fasttext.FastText import load_model, _FastText
14
  import gradio as gr
 
16
  from langid.langid import LanguageIdentifier, model
17
 
18
  from project_settings import project_path, temp_directory
19
+ from toolbox.os.command import Command
20
 
21
 
22
  def get_args():
 
42
 
43
  lang_id_identifier: LanguageIdentifier = None
44
  fasttext_model: _FastText = None
45
+ qgyd_lang_id_predictor: TextClassifierPredictor = None
46
+
47
+
48
+ trained_model_dir = project_path / "trained_models/huggingface"
49
+ trained_model_dir.mkdir(parents=True, exist_ok=True)
50
+
51
+
52
def init_qgyd_lang_id_predictor() -> TextClassifierPredictor:
    """
    Build the predictor for the ``qgyd2021/language_identification`` model.

    If the model directory does not exist under ``trained_model_dir`` yet, the
    repository is cloned from ``https://huggingface.co/`` first; otherwise the
    existing local copy is used as-is.

    :return: a ``TextClassifierPredictor`` wrapping the archive's model and
        dataset reader.
    :raises subprocess.CalledProcessError: if ``git clone`` exits non-zero.
    :raises FileNotFoundError: if the ``git`` executable cannot be found.
    """
    # Function-scope import so the block does not depend on the module header.
    import subprocess

    model_name = "qgyd2021/language_identification"
    model_path = trained_model_dir / model_name
    if not model_path.exists():
        model_path.parent.mkdir(parents=True, exist_ok=True)
        # Clone directly into the target directory instead of `cd`-ing there:
        # changing the working directory is process-wide and was never undone.
        # `check=True` surfaces a failed clone here rather than as a confusing
        # error from `load_archive` below.
        subprocess.run(
            [
                "git",
                "clone",
                "https://huggingface.co/{}".format(model_name),
                model_path.as_posix(),
            ],
            check=True,
        )

    archive = load_archive(archive_file=model_path.as_posix())

    predictor = TextClassifierPredictor(
        model=archive.model,
        dataset_reader=archive.dataset_reader,
    )
    return predictor
67
 
68
 
69
  def click_lang_id_button(text: str, ground_true: str, model_name: str):
70
  global lang_id_identifier
71
  global fasttext_model
72
+ global qgyd_lang_id_predictor
73
 
74
  text = str(text).strip()
75
 
 
79
  label, prob = fasttext_model.predict(text, k=1)
80
  label = label[0][9:]
81
  prob = prob[0]
82
+ elif model_name == "qgyd_lang_id_1":
83
+ json_dict = {
84
+ "sentence": text
85
+ }
86
+ outputs = qgyd_lang_id_predictor.predict_json(
87
+ json_dict
88
+ )
89
+ label = outputs["label"]
90
+ probs = outputs["probs"]
91
+ prob = max(probs)
92
  else:
93
  label = "model_name not available."
94
  prob = -1
 
112
 
113
  global lang_id_identifier
114
  global fasttext_model
115
+ global qgyd_lang_id_predictor
116
  lang_id_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
117
  fasttext_model = fasttext.load_model(args.fasttext_model)
118
+ qgyd_lang_id_predictor = init_qgyd_lang_id_predictor()
119
 
120
  blocks = gr.Interface(
121
  click_lang_id_button,
122
  inputs=[
123
  inputs.Textbox(lines=3, label="text"),
124
  inputs.Textbox(label="ground_true"),
125
+ inputs.Dropdown(choices=["langid", "fasttext", "qgyd_lang_id_1"], default="langid", label="model_name"),
126
  ],
127
  outputs=[
128
  outputs.Textbox(label="label"),
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
- gradio==2.1.1
2
- langid==1.1.6
3
  fasttext==0.9.2
 
 
 
 
 
 
1
  fasttext==0.9.2
2
+ langid==1.1.6
3
+ gradio==2.3.0
4
+ allennlp
toolbox/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ if __name__ == "__main__":
5
+ pass
toolbox/os/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+
5
+ if __name__ == '__main__':
6
+ pass
toolbox/os/command.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+
4
class Command(object):
    """
    Small helpers for running shell commands.

    Commands whose first word is listed in ``custom_command`` are dispatched
    to the classmethod of the same name instead of being sent to the shell.
    """
    # `cd` must run in-process: a `cd` executed in a child shell would not
    # affect the working directory of this process.
    custom_command = [
        'cd'
    ]

    @staticmethod
    def _get_cmd(command):
        """
        Split a command line into its first word and the remaining text.

        :param command: command line; it is converted with ``str()`` and stripped.
        :return: ``(cmd, args)``, or ``None`` when the command is blank.
        """
        command = str(command).strip()
        if command == '':
            return None
        # Split on the first space only; the rest is kept verbatim.
        cmd, _, args = command.partition(' ')
        return cmd, args

    @classmethod
    def popen(cls, command):
        """
        Run ``command`` and return what it wrote to standard output.

        :param command: command line to execute.
        :return: the command's stdout as a string, the return value of the
            custom handler for commands in ``custom_command``, or ``''`` for a
            blank command.
        """
        parsed = cls._get_cmd(command)
        if parsed is None:
            # Blank command: nothing to run (previously crashed on unpacking None).
            return ''

        cmd, args = parsed
        if cmd in cls.custom_command:
            method = getattr(cls, cmd)
            return method(args)

        # The context manager closes the pipe even if read() raises.
        with os.popen(command) as resp:
            return resp.read()

    @classmethod
    def cd(cls, args):
        """
        Change the working directory of the current process.

        :param args: target directory; a relative path is resolved against the
            current working directory.
        """
        # os.path.join discards the first component when `args` is absolute,
        # so absolute and relative targets are handled by the same call.
        os.chdir(os.path.join(os.getcwd(), args))

    @classmethod
    def system(cls, command):
        """Run ``command`` through ``os.system`` and return its result."""
        return os.system(command)

    def __init__(self):
        pass
46
+
47
+
48
def ps_ef_grep(keyword: str):
    """
    List the ``ps -ef`` rows that mention ``keyword``.

    :param keyword: text to search for in the process listing.
    :return: list of output lines that contain ``keyword`` and do not contain
        ``'grep'`` (which drops the row of the ``grep`` process itself).
    """
    # Function-scope import so the block does not depend on the module header.
    import shlex

    # Quote the keyword so spaces or shell metacharacters in it are passed to
    # grep as a single literal argument instead of being interpreted by the shell.
    cmd = 'ps -ef | grep {}'.format(shlex.quote(keyword))
    rows = str(Command.popen(cmd)).split('\n')
    return [row for row in rows if keyword in row and 'grep' not in row]
toolbox/os/environment.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import json
4
+ import os
5
+
6
+ from dotenv import load_dotenv
7
+ from dotenv.main import DotEnv
8
+
9
+ from smart.json.misc import traverse
10
+
11
+
12
class EnvironmentManager(object):
    """
    Load ``<path>/<env>.env`` into the process environment and read values
    back from ``os.environ`` with optional type conversion.
    """
    def __init__(self, path, env, override=False):
        """
        :param path: directory that contains the dotenv files.
        :param env: environment name; the file read is ``<env>.env``.
        :param override: forwarded to ``load_dotenv``.
        """
        self.filename = os.path.join(path, '{}.env'.format(env))
        load_dotenv(dotenv_path=self.filename, override=override)

        # Records every key requested through `get` with the value returned.
        self._environ = dict()

    def open_dotenv(self, filename: str = None):
        """
        Parse a dotenv file and return its contents.

        :param filename: file to parse; defaults to the file given at construction.
        :return: the result of ``DotEnv.dict()`` for that file.
        """
        dotenv = DotEnv(
            dotenv_path=filename or self.filename,
            stream=None,
            verbose=False,
            interpolate=False,
            override=False,
            encoding="utf-8",
        )
        return dotenv.dict()

    def get(self, key, default=None, dtype=str):
        """
        Look ``key`` up in ``os.environ``.

        :param key: environment variable name.
        :param default: returned as-is (not passed through ``dtype``) when the
            variable is not set.
        :param dtype: callable applied to the variable's string value.
        :return: the converted value, or ``default`` when the variable is missing.
        """
        raw = os.environ.get(key)
        value = default if raw is None else dtype(raw)
        self._environ[key] = value
        return value
48
+
49
+
50
+ _DEFAULT_DTYPE_MAP = {
51
+ 'int': int,
52
+ 'float': float,
53
+ 'str': str,
54
+ 'json.loads': json.loads
55
+ }
56
+
57
+
58
class JsonConfig(object):
    """
    Resolve JSON values written as ``$float:threshold``: look ``threshold`` up
    in the environment and convert the value found there to ``float``.
    """
    def __init__(self, dtype_map: dict = None, environment: "EnvironmentManager" = None):
        """
        :param dtype_map: maps the type name before the colon to a converter;
            defaults to ``_DEFAULT_DTYPE_MAP``.
        :param environment: object with a ``get(key)`` method used for lookups;
            defaults to ``os.environ``.
        """
        self.dtype_map = dtype_map or _DEFAULT_DTYPE_MAP
        # Compare against None so an explicitly supplied (possibly empty)
        # mapping is not silently replaced by os.environ.
        self.environment = os.environ if environment is None else environment

    def sanitize_by_filename(self, filename: str):
        """Load the JSON file ``filename`` and resolve its ``$type:key`` values."""
        with open(filename, 'r', encoding='utf-8') as f:
            js = json.load(f)

        return self.sanitize_by_json(js)

    def sanitize_by_json(self, js):
        """Pass ``js`` through ``traverse`` with ``sanitize`` as the callback."""
        js = traverse(
            js,
            callback=self.sanitize,
            environment=self.environment
        )
        return js

    def sanitize(self, string, environment):
        """
        Resolve one value; supports environment references that start with ``$``.

        :param string: the value to inspect; anything that is not a string
            starting with ``$`` and containing a ``:`` is returned unchanged.
        :param environment: object with a ``get(key)`` method.
        :return: the converted environment value, or ``string`` itself.
        :raises KeyError: if the type name is not in ``dtype_map``.
        :raises AssertionError: if the key is not present in ``environment``.
        """
        if not (isinstance(string, str) and string.startswith('$')):
            return string

        # Split on the first colon only, so keys may themselves contain ':'.
        dtype_name, separator, key = string[1:].partition(':')
        if not separator:
            # Not of the `$type:key` form (e.g. "$5.00"): treat as a plain string
            # rather than failing on tuple unpacking.
            return string

        dtype = self.dtype_map[dtype_name]

        value = environment.get(key)
        if value is None:
            raise AssertionError('environment not exist. key: {}'.format(key))

        return dtype(value)
96
+
97
+
98
def demo1():
    """
    Manual example: load the ``dev`` dotenv file from the project's
    ``server/callbot_server/dotenv`` directory, read ``init_scenes`` as JSON,
    and print it together with the values cached by the manager.
    """
    import json

    from project_settings import project_path

    dotenv_dir = os.path.join(project_path, 'server/callbot_server/dotenv')
    environment = EnvironmentManager(path=dotenv_dir, env='dev')

    init_scenes = environment.get(key='init_scenes', dtype=json.loads)
    print(init_scenes)
    print(environment._environ)
111
+
112
+
113
+ if __name__ == '__main__':
114
+ demo1()
toolbox/os/other.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import inspect
3
+
4
+
5
def pwd():
    """
    Return the directory of the file from which this function is called.

    The caller's module ``__file__`` is used when available. When the calling
    frame does not belong to a module with a ``__file__`` (``inspect.getmodule``
    returns ``None``), the frame's own filename is used instead.

    :return: absolute directory path as a string.
    """
    # Index 1 is the caller's frame; context=0 skips reading source lines.
    caller = inspect.stack(0)[1]
    module = inspect.getmodule(caller.frame)
    # Fall back to the frame's filename instead of failing on `None.__file__`.
    filename = getattr(module, '__file__', None) or caller.filename
    return os.path.dirname(os.path.abspath(filename))