Yehor Smoliakov committed on
Commit
6752be4
1 Parent(s): 37d0c2d
Files changed (7) hide show
  1. .gitattributes +0 -35
  2. .gitignore +5 -0
  3. Dockerfile +61 -0
  4. README.md +28 -6
  5. app.py +177 -0
  6. requirements-dev.txt +1 -0
  7. requirements.txt +9 -0
.gitattributes CHANGED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ .idea/
2
+ .venv/
3
+ .ruff_cache/
4
+
5
+ flagged/
Dockerfile ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04
2
+
3
+ ENV DEBIAN_FRONTEND=noninteractive
4
+
5
+ RUN apt-get update && \
6
+ apt-get upgrade -y && \
7
+ apt-get install -y --no-install-recommends \
8
+ git \
9
+ git-lfs \
10
+ wget \
11
+ curl \
12
+ # python build dependencies \
13
+ build-essential \
14
+ libssl-dev \
15
+ zlib1g-dev \
16
+ libbz2-dev \
17
+ libreadline-dev \
18
+ libsqlite3-dev \
19
+ libncursesw5-dev \
20
+ xz-utils \
21
+ tk-dev \
22
+ libxml2-dev \
23
+ libxmlsec1-dev \
24
+ libffi-dev \
25
+ liblzma-dev \
26
+ # gradio dependencies \
27
+ ffmpeg \
28
+ && apt-get clean \
29
+ && rm -rf /var/lib/apt/lists/*
30
+
31
+
32
+ RUN useradd -m -u 1000 user
33
+ USER user
34
+ ENV HOME=/home/user \
35
+ PATH=/home/user/.local/bin:${PATH}
36
+ WORKDIR ${HOME}/app
37
+
38
+ RUN curl https://pyenv.run | bash
39
+ ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
40
+ ARG PYTHON_VERSION=3.10.12
41
# Install the requested Python with pyenv and make it the default interpreter.
# Cython is installed here because youtokentome is built further down with
# --no-build-isolation, so its build-time dependencies must already be present
# in the environment (the README install steps do the same).
RUN pyenv install ${PYTHON_VERSION} && \
    pyenv global ${PYTHON_VERSION} && \
    pyenv rehash && \
    pip install --no-cache-dir -U pip setuptools wheel && \
    pip install --no-cache-dir Cython packaging ninja
46
+
47
+ COPY --chown=1000 ./requirements.txt /tmp/requirements.txt
48
+ RUN pip install youtokentome --no-build-isolation
49
+ RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt
50
+
51
+
52
+ COPY --chown=1000 . ${HOME}/app
53
+ ENV PYTHONPATH=${HOME}/app \
54
+ PYTHONUNBUFFERED=1 \
55
+ GRADIO_ALLOW_FLAGGING=never \
56
+ GRADIO_NUM_PORTS=1 \
57
+ GRADIO_SERVER_NAME=0.0.0.0 \
58
+ GRADIO_THEME=huggingface \
59
+ SYSTEM=spaces
60
+
61
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -1,11 +1,33 @@
1
  ---
2
- title: Punctuation Uk
3
- emoji: 😻
4
  colorFrom: blue
5
- colorTo: pink
6
- sdk: docker
 
7
  pinned: false
8
- license: apache-2.0
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Restore Punctuation and Capitalization for Ukrainian
3
+ emoji: ✍️
4
  colorFrom: blue
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ app_file: app.py
8
  pinned: false
9
+ sdk_version: 4.39.0
10
  ---
11
 
12
+ ## Install
13
+
14
+ ```shell
15
+ uv venv --python 3.10
16
+
17
+ source .venv/bin/activate
18
+
19
+ uv pip install Cython packaging
20
+
21
+ uv pip install youtokentome --no-build-isolation
22
+
23
+ SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True uv pip install -r requirements.txt
24
+
25
+ # in development mode
26
+ uv pip install -r requirements-dev.txt
27
+ ```
28
+
29
+ ## Run
30
+
31
+ ```shell
32
+ python app.py
33
+ ```
app.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import time
3
+
4
+ # import torch
5
+ import gradio as gr
6
+
7
+ from nemo import __version__ as nemo_version
8
+ from nemo.collections.nlp.models import PunctuationCapitalizationModel
9
+
10
# Config
# Hugging Face Hub id of the NeMo punctuation/capitalization checkpoint that
# this app loads and serves.
model_name = "dchaplinsky/punctuation_uk_bert"

# Passed to the "Enhance" click handler as its concurrency limit.
concurrency_limit = 5

# Torch
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load the model once at import time so every request reuses the same instance.
model = PunctuationCapitalizationModel.from_pretrained(model_name)
23
+
24
+ examples = [
25
+ "тема про яку не люблять говорити офіційні джерела у генштабі і міноборони це хімічна зброя окупанти вже тривалий час використовують хімічну зброю заборонену",
26
+ "всіма конвенціями якщо спочатку це були гранати з дронів то тепер фіксують випадки застосування",
27
+ "хімічних снарядів причому склад отруйної речовони різний а отже й наслідки для наших військових теж різні",
28
+ "використовує на фронті все що має і хімічна зброя не вийняток тож з чим маємо справу розбиралася марія моганисян",
29
+ "двох тисяч випадків застосування росіянами боєприпасів споряджених небезпечними хімічними речовинами",
30
+ "на всі писані норми марія моганисян олександр моторний спецкор марафон єдині новини",
31
+ ]
32
+
33
+ title = "Restore Punctuation and Capitalization for Ukrainian"
34
+
35
+ # https://www.tablesgenerator.com/markdown_tables
36
+ authors_table = """
37
+ ## Authors
38
+
39
+ Follow them on social networks and **contact** if you need any help or have any questions:
40
+
41
+ | <img src="https://avatars.githubusercontent.com/u/7875085?v=4" width="100"> **Yehor Smoliakov** |
42
+ |-------------------------------------------------------------------------------------------------|
43
+ | https://t.me/smlkw in Telegram |
44
+ | https://x.com/yehor_smoliakov at X |
45
+ | https://github.com/egorsmkv at GitHub |
46
+ | https://huggingface.co/Yehor at Hugging Face |
47
+ | or use egorsmkv@gmail.com |
48
+ """.strip()
49
+
50
+ description_head = f"""
51
+ # {title}
52
+
53
+ ## Overview
54
+
55
+ This space uses https://huggingface.co/dchaplinsky/punctuation_uk_bert model.
56
+
57
+ Paste the text you want to enhance.
58
+ """.strip()
59
+
60
+ description_foot = f"""
61
+ {authors_table}
62
+ """.strip()
63
+
64
+ enhanced_text_value = """
65
+ Enhanced text will appear here.
66
+
67
+ Choose **an example** below the Enhance button or paste **your text**.
68
+ """.strip()
69
+
70
+ tech_env = f"""
71
+ #### Environment
72
+
73
+ - Python: {sys.version}
74
+ """.strip()
75
+
76
+ tech_libraries = f"""
77
+ #### Libraries
78
+
79
+ - nemo: {nemo_version}
80
+ - gradio: {gr.__version__}
81
+ """.strip()
82
+
83
+
84
def inference(text, progress=gr.Progress()):
    """Restore punctuation and capitalization in ``text`` and format the output.

    Args:
        text: Raw text taken from the input textbox.
        progress: Gradio progress tracker used to report progress in the UI.

    Returns:
        A string containing one ``> ``-prefixed entry per processed sentence,
        followed by an ``Elapsed time: ... seconds`` line.

    Raises:
        gr.Error: If ``text`` is empty or consists only of whitespace.
    """
    # Reject whitespace-only input too; otherwise the loop below would skip it
    # and the user would get a result with no text at all.
    if not text or not text.strip():
        raise gr.Error("Please paste your text.")

    gr.Info("Starting enhancing", duration=2)

    progress(0, desc="Enhancing...")

    results = []

    # The whole input is currently processed as a single item.
    sentences = [
        text,
    ]

    for sentence in progress.tqdm(sentences, desc="Enhancing...", unit="sentence"):
        sentence = sentence.strip()

        if not sentence:
            continue

        # perf_counter is monotonic, so the measured duration cannot go
        # negative or jump if the system clock is adjusted.
        t0 = time.perf_counter()

        predictions = model.add_punctuation_capitalization([sentence])

        elapsed_time = round(time.perf_counter() - t0, 2)

        # Show a visible placeholder when the model returns nothing.
        if not predictions:
            predictions = ["-"]

        enhanced_text = "\n".join(predictions).strip()

        # Always keep the result, even when the model left the text unchanged,
        # so the output box never ends up with only the timing line.
        results.append(
            {
                "sentence": sentence,
                "enhanced_text": enhanced_text,
                "elapsed_time": elapsed_time,
            }
        )

    gr.Info("Finished!", duration=2)

    result_texts = []

    for result in results:
        result_texts.append(f'> {result["enhanced_text"]}')
        result_texts.append("\n")

    # Round again after summing to avoid float noise such as 0.30000000000000004.
    sum_elapsed_text = round(sum(result["elapsed_time"] for result in results), 2)
    result_texts.append(f"Elapsed time: {sum_elapsed_text} seconds")

    return "\n".join(result_texts)
137
+
138
+
139
# Top-level Blocks container for the app: page title from `title`, analytics
# disabled, Base theme.
# NOTE(review): the nesting below was reconstructed from a flattened diff view;
# confirm it against the repository copy of app.py.
demo = gr.Blocks(
    title=title,
    analytics_enabled=False,
    theme=gr.themes.Base(),
)

# Components are declared inside the Blocks context in the order written here.
with demo:
    # Heading and overview text.
    gr.Markdown(description_head)

    gr.Markdown("## Usage")

    # Input textbox and output textbox.
    with gr.Row():
        text = gr.Textbox(label="Text", autofocus=True, max_lines=1)
        enhanced_text = gr.Textbox(
            label="Enhanced text",
            placeholder=enhanced_text_value,
            show_copy_button=True,
        )

    # Clicking the button runs inference() on the input textbox and writes the
    # returned string into the output textbox.
    gr.Button("Enhance").click(
        inference,
        concurrency_limit=concurrency_limit,
        inputs=text,
        outputs=enhanced_text,
    )

    # Predefined sample inputs wired to the input textbox.
    with gr.Row():
        gr.Examples(label="Choose an example", inputs=text, examples=examples)

    # Authors section.
    gr.Markdown(description_foot)

    # Runtime environment and library versions.
    gr.Markdown("### Gradio app uses the following technologies:")
    with gr.Row():
        gr.Markdown(tech_env)
        gr.Markdown(tech_libraries)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.queue()
    demo.launch()
requirements-dev.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ruff
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+
3
+ Cython
4
+ packaging
5
+
6
+ huggingface_hub<0.22.1
7
+ numpy<2
8
+
9
+ nemo_toolkit[nlp]