Spaces:
Sleeping
Sleeping
update
Browse files- .gitignore +1 -1
- Dockerfile +3 -1
- install.sh +56 -0
- language_identification.md +13 -0
- main.py +39 -58
- requirements.txt +3 -2
.gitignore
CHANGED
@@ -3,7 +3,7 @@
|
|
3 |
.idea/
|
4 |
|
5 |
#data/
|
6 |
-
|
7 |
temp/
|
8 |
|
9 |
**/cache/
|
|
|
3 |
.idea/
|
4 |
|
5 |
#data/
|
6 |
+
pretrained_models/
|
7 |
temp/
|
8 |
|
9 |
**/cache/
|
Dockerfile
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
|
2 |
# you will also find guides on how best to write your Dockerfile
|
3 |
|
4 |
-
FROM python:3.
|
5 |
|
6 |
WORKDIR /code
|
7 |
|
@@ -27,4 +27,6 @@ WORKDIR $HOME/app
|
|
27 |
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
28 |
COPY --chown=user . $HOME/app
|
29 |
|
|
|
|
|
30 |
CMD ["python", "main.py"]
|
|
|
1 |
# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
|
2 |
# you will also find guides on how best to write your Dockerfile
|
3 |
|
4 |
+
FROM python:3.6
|
5 |
|
6 |
WORKDIR /code
|
7 |
|
|
|
27 |
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
28 |
COPY --chown=user . $HOME/app
|
29 |
|
30 |
+
RUN bash -c 'bash install.sh --stage 1 --stop_stage 1 --system_version ubuntu'
|
31 |
+
|
32 |
CMD ["python", "main.py"]
|
install.sh
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env bash

# Download the pretrained fastText language-identification models into
# ./pretrained_models (relative to the directory the script is run from).
#
# Usage:
#   bash install.sh --stage 1 --stop_stage 1 --system_version centos
#
# Every "--name value" option overrides the shell variable of the same name
# (dashes become underscores). Only variables that are already declared below
# are accepted as options.

verbose=true;
stage=-1
stop_stage=2
# Must be declared here: the option parser rejects any --option whose variable
# is not already set, and callers pass --system_version.
system_version="centos"

work_dir="$(pwd)"


# parse options
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
      # Reject options that do not correspond to a declared variable.
      eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;

      # Every option needs a value; without this check "shift 2" below fails
      # on a trailing option and the loop never terminates.
      if [ $# -lt 2 ]; then
        echo "$0: missing value for option $1" 1>&2
        exit 1;
      fi

      # Remember the current value so Boolean options can be validated.
      old_value="$(eval echo \$$name)";
      if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi

      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      eval "${name}=\"$2\"";

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;

    *) break;
  esac
done


$verbose && echo "system_version: ${system_version}"

pretrained_models_dir="$(pwd)/pretrained_models"

mkdir -p "${pretrained_models_dir}"


if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  $verbose && echo "stage 1: download fasttext models"
  cd "${pretrained_models_dir}" || exit 1;

  # Abort on a failed download so a broken image is not built silently.
  wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin || exit 1;
  wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz || exit 1;

fi
|
language_identification.md
CHANGED
@@ -16,3 +16,16 @@ https://github.com/saffsd/langid.py/tree/master/langid/train
|
|
16 |
4. 训练 NB (Naive Bayes) 概率模型, 即每个 item 对每个类型的概率贡献.
|
17 |
|
18 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
4. 训练 NB (Naive Bayes) 概率模型, 即每个 item 对每个类型的概率贡献.
|
17 |
|
18 |
```
|
19 |
+
|
20 |
+
|
21 |
+
### fasttext
|
22 |
+
|
23 |
+
识别 176 种语言。
|
24 |
+
https://fasttext.cc/docs/en/language-identification.html
|
25 |
+
|
26 |
+
|
27 |
+
### 参考
|
28 |
+
|
29 |
+
```text
|
30 |
+
https://zhuanlan.zhihu.com/p/600245782
|
31 |
+
```
|
main.py
CHANGED
@@ -6,14 +6,12 @@ https://huggingface.co/spaces/sayakpaul/demo-docker-gradio
|
|
6 |
import argparse
|
7 |
import json
|
8 |
import platform
|
9 |
-
from typing import Tuple
|
10 |
|
|
|
|
|
11 |
import gradio as gr
|
12 |
-
import
|
13 |
from langid.langid import LanguageIdentifier, model
|
14 |
-
import matplotlib.pyplot as plt
|
15 |
-
import numpy as np
|
16 |
-
from PIL import Image
|
17 |
|
18 |
from project_settings import project_path, temp_directory
|
19 |
|
@@ -30,29 +28,40 @@ def get_args():
|
|
30 |
default=(project_path / "lang_id_examples.json").as_posix(),
|
31 |
type=str
|
32 |
)
|
|
|
|
|
|
|
|
|
|
|
33 |
args = parser.parse_args()
|
34 |
return args
|
35 |
|
36 |
|
37 |
-
lang_id_identifier
|
|
|
38 |
|
39 |
|
40 |
def click_lang_id_button(text: str, ground_true: str, model_name: str):
|
41 |
global lang_id_identifier
|
|
|
42 |
|
43 |
if model_name == "langid":
|
44 |
label, prob = lang_id_identifier.classify(text)
|
|
|
|
|
|
|
|
|
45 |
else:
|
46 |
label = "model_name not available."
|
47 |
-
prob =
|
48 |
-
return label, round(prob, 4)
|
49 |
|
50 |
|
51 |
def main():
|
52 |
args = get_args()
|
53 |
|
54 |
brief_description = """
|
55 |
-
|
56 |
"""
|
57 |
|
58 |
# description
|
@@ -63,56 +72,28 @@ def main():
|
|
63 |
with open(args.lang_id_examples_file, "r", encoding="utf-8") as f:
|
64 |
lang_id_examples = json.load(f)
|
65 |
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
gr.Examples(
|
89 |
-
examples=lang_id_examples,
|
90 |
-
inputs=[
|
91 |
-
lang_id_text,
|
92 |
-
lang_id_ground_true,
|
93 |
-
lang_id_model_name,
|
94 |
-
],
|
95 |
-
outputs=[lang_id_label, lang_id_prob],
|
96 |
-
fn=click_lang_id_button
|
97 |
-
)
|
98 |
-
|
99 |
-
# click event
|
100 |
-
lang_id_button.click(
|
101 |
-
click_lang_id_button,
|
102 |
-
inputs=[
|
103 |
-
lang_id_text,
|
104 |
-
lang_id_ground_true,
|
105 |
-
lang_id_model_name,
|
106 |
-
],
|
107 |
-
outputs=[lang_id_label, lang_id_prob],
|
108 |
-
)
|
109 |
-
|
110 |
-
gr.Markdown(value=description)
|
111 |
-
|
112 |
-
blocks.queue().launch(
|
113 |
share=False if platform.system() == "Windows" else False,
|
114 |
-
server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",
|
115 |
-
server_port=7860
|
116 |
)
|
117 |
return
|
118 |
|
|
|
6 |
import argparse
|
7 |
import json
|
8 |
import platform
|
|
|
9 |
|
10 |
+
import fasttext
|
11 |
+
from fasttext.FastText import load_model, _FastText
|
12 |
import gradio as gr
|
13 |
+
from gradio import inputs, outputs
|
14 |
from langid.langid import LanguageIdentifier, model
|
|
|
|
|
|
|
15 |
|
16 |
from project_settings import project_path, temp_directory
|
17 |
|
|
|
28 |
default=(project_path / "lang_id_examples.json").as_posix(),
|
29 |
type=str
|
30 |
)
|
31 |
+
parser.add_argument(
|
32 |
+
"--fasttext_model",
|
33 |
+
default=(project_path / "pretrained_models/lid.176.bin").as_posix(),
|
34 |
+
type=str
|
35 |
+
)
|
36 |
args = parser.parse_args()
|
37 |
return args
|
38 |
|
39 |
|
40 |
+
lang_id_identifier: LanguageIdentifier = None
|
41 |
+
fasttext_model: _FastText = None
|
42 |
|
43 |
|
44 |
def click_lang_id_button(text: str, ground_true: str, model_name: str):
    """Identify the language of ``text`` with the selected model.

    Args:
        text: Text to classify; may span several lines.
        ground_true: Expected label supplied by the UI. It is accepted for
            interface compatibility and is not used by this function.
        model_name: ``"langid"`` or ``"fasttext"``.

    Returns:
        A ``(label, prob)`` tuple where ``prob`` is the score rounded to four
        decimals and rendered as a string. For an unsupported ``model_name``
        the result is ``("model_name not available.", "-1")``.
    """
    global lang_id_identifier
    global fasttext_model

    fasttext_label_prefix = "__label__"

    if model_name == "langid":
        label, prob = lang_id_identifier.classify(text)
    elif model_name == "fasttext":
        # fastText's predict() raises ValueError when the text contains "\n",
        # and the input textbox is multi-line, so collapse it to one line.
        single_line = " ".join(text.splitlines())
        labels, probs = fasttext_model.predict(single_line, k=1)
        if len(labels) > 0:
            label = labels[0]
            # Strip the "__label__" prefix, e.g. "__label__en" -> "en".
            if label.startswith(fasttext_label_prefix):
                label = label[len(fasttext_label_prefix):]
            prob = probs[0]
        else:
            # Guard against an empty prediction instead of an IndexError.
            label = "no prediction."
            prob = -1
    else:
        label = "model_name not available."
        prob = -1
    return label, str(round(prob, 4))
|
58 |
|
59 |
|
60 |
def main():
|
61 |
args = get_args()
|
62 |
|
63 |
brief_description = """
|
64 |
+
Language Identification
|
65 |
"""
|
66 |
|
67 |
# description
|
|
|
72 |
with open(args.lang_id_examples_file, "r", encoding="utf-8") as f:
|
73 |
lang_id_examples = json.load(f)
|
74 |
|
75 |
+
global lang_id_identifier
|
76 |
+
global fasttext_model
|
77 |
+
lang_id_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
|
78 |
+
fasttext_model = fasttext.load_model(args.fasttext_model)
|
79 |
+
|
80 |
+
blocks = gr.Interface(
|
81 |
+
click_lang_id_button,
|
82 |
+
inputs=[
|
83 |
+
inputs.Textbox(lines=3, label="text"),
|
84 |
+
inputs.Textbox(label="ground_true"),
|
85 |
+
inputs.Dropdown(choices=["langid", "fasttext"], default="langid", label="model_name"),
|
86 |
+
],
|
87 |
+
outputs=[
|
88 |
+
outputs.Textbox(label="label"),
|
89 |
+
outputs.Textbox(label="prob"),
|
90 |
+
],
|
91 |
+
examples=lang_id_examples,
|
92 |
+
description=brief_description
|
93 |
+
)
|
94 |
+
|
95 |
+
blocks.launch(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
share=False if platform.system() == "Windows" else False,
|
|
|
|
|
97 |
)
|
98 |
return
|
99 |
|
requirements.txt
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
-
gradio==
|
2 |
-
langid==1.1.6
|
|
|
|
1 |
+
gradio==2.1.1
|
2 |
+
langid==1.1.6
|
3 |
+
fasttext==0.9.2
|