htahir1 committed on
Commit
9da8c80
1 Parent(s): d0debd0

Upload folder using huggingface_hub

Browse files
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9

WORKDIR /code

# Copy only the requirements first so the dependency layer is cached
# independently of application code changes.
COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user
# Switch to the "user" user
USER user
# Set home to the user's home directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# app.py is a click command that only declares its own options, so passing
# "--server.port"/"--server.address" on the command line makes click abort
# with "No such option". Gradio reads the bind address and port from these
# environment variables instead.
ENV GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_SERVER_PORT=7860

# Set the working directory to the user's home directory
WORKDIR $HOME/app

# Copy the current directory contents into the container at $HOME/app setting the owner to the user
COPY --chown=user . $HOME/app

EXPOSE 7860

CMD ["python", "app.py"]
__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Apache Software License 2.0
2
+ #
3
+ # Copyright (c) ZenML GmbH 2023. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
app.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Apache Software License 2.0
2
+ #
3
+ # Copyright (c) ZenML GmbH 2023. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ from typing import Optional
17
+ import click
18
+ import numpy as np
19
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
20
+ from os.path import dirname
21
+
22
+ import gradio as gr
23
+
24
+
25
@click.command()
@click.option(
    "--tokenizer_name_or_path",
    default="tokenizer",
    help="Name or the path of the tokenizer.",
)
@click.option(
    "--model_name_or_path", default="model", help="Name or the path of the model."
)
@click.option(
    "--labels", default="Negative,Positive", help="Comma-separated list of labels."
)
@click.option(
    "--title", default="ZenML NLP Use-Case", help="Title of the Gradio interface."
)
@click.option(
    "--description",
    default="Text Classification - Sentiment Analysis - ZenML - Gradio",
    help="Description of the Gradio interface.",
)
@click.option(
    "--interpretation",
    default="default",
    help="Interpretation mode for the Gradio interface.",
)
@click.option(
    "--examples",
    default="This is an awesome journey, I love it!",
    help="Example text to show in the Gradio interface.",
)
def sentiment_analysis(
    tokenizer_name_or_path: Optional[str],
    model_name_or_path: Optional[str],
    labels: Optional[str],
    title: Optional[str],
    description: Optional[str],
    interpretation: Optional[str],
    examples: Optional[str],
):
    """Launches a Gradio interface for sentiment analysis.

    Loads a tokenizer and a sequence-classification model from directories
    located next to this file, then serves a Gradio interface that returns
    a score per label for the submitted text.

    The tokenizer and model are loaded once, before the interface starts,
    so individual predictions do not pay the loading cost again.

    Args:
        tokenizer_name_or_path (str): Directory of the tokenizer, relative
            to the directory containing this file.
        model_name_or_path (str): Directory of the model, relative to the
            directory containing this file.
        labels (str): Comma-separated list of labels, in the order of the
            model's output scores. Surrounding whitespace is ignored.
        title (str): Title of the Gradio interface.
        description (str): Description of the Gradio interface.
        interpretation (str): Interpretation mode for the Gradio interface.
        examples (str): A single example text shown in the Gradio
            interface. It is not split on commas, because example
            sentences (including the default) may contain commas.
    """
    label_names = [name.strip() for name in labels.split(",")]
    example_texts = [examples]

    # Resolve and load the artifacts once, up front, instead of on every
    # prediction request.
    base_dir = dirname(__file__)
    model_path = f"{base_dir}/{model_name_or_path}/"
    print(f"Loading model from {model_path}")
    tokenizer_path = f"{base_dir}/{tokenizer_name_or_path}/"
    print(f"Loading tokenizer from {tokenizer_path}")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    def preprocess(text: str) -> str:
        """Replaces user mentions and links with placeholder tokens.

        Args:
            text (str): Input text.

        Returns:
            str: Text where every space-separated token starting with "@"
                (and longer than one character) becomes "@user" and every
                token starting with "http" becomes "http".
        """
        cleaned = []
        for token in text.split(" "):
            if token.startswith("@") and len(token) > 1:
                token = "@user"
            if token.startswith("http"):
                token = "http"
            cleaned.append(token)
        return " ".join(cleaned)

    def softmax(x):
        """Returns the softmax of a 1-D array of scores.

        The maximum is subtracted before exponentiating so that large
        scores do not overflow.
        """
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    def analyze_text(text):
        """Scores the text and returns a mapping of label to probability."""
        # Truncate to the tokenizer's maximum length so that very long
        # inputs do not exceed the model's position embeddings.
        encoded_input = tokenizer(
            preprocess(text), return_tensors="pt", truncation=True
        )
        output = model(**encoded_input)
        probabilities = softmax(output[0][0].detach().numpy())
        return {
            label: float(score)
            for label, score in zip(label_names, probabilities)
        }

    # NOTE(review): `interpretation` appears to have been removed from
    # gr.Interface in Gradio 4.x and requirements.txt leaves gradio
    # unpinned -- confirm the installed version still accepts it.
    demo = gr.Interface(
        fn=analyze_text,
        inputs=[gr.TextArea("Write your text or tweet here", label="Analyze Text")],
        outputs=["label"],
        title=title,
        description=description,
        interpretation=interpretation,
        examples=example_texts,
    )

    demo.launch(share=True, debug=True)


if __name__ == "__main__":
    sentiment_analysis()
model/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/home/htahir1/.config/zenml/local_stores/aa26dca1-2120-4d02-9ca7-72b72126f32e/mlruns/884616099654804863/0f1dfd9479b54e5cb470d37c90e45feb/artifacts/nlp_use_case_model/model",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 514,
17
+ "model_type": "roberta",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 1,
21
+ "position_embedding_type": "absolute",
22
+ "problem_type": "single_label_classification",
23
+ "torch_dtype": "float32",
24
+ "transformers_version": "4.34.1",
25
+ "type_vocab_size": 1,
26
+ "use_cache": true,
27
+ "vocab_size": 50265
28
+ }
model/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ee8854b001b8c0848b90e95c08468835f755ea35b440b53936b7deed22d529a
3
+ size 498655278
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ nltk
2
+ torch
3
+ torchvision
4
+ torchaudio
5
+ gradio
6
+ datasets==2.12.0
7
+ numpy==1.22.4
8
+ pandas==1.5.3
9
+ session_info==1.0.0
10
+ scikit-learn==1.2.2
11
+ transformers==4.28.1
12
+ IPython==7.34.0
serve.yaml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Task name (optional), used for display purposes.
2
+ name: ZenML NLP project
3
+
4
+ resources:
5
+ cloud: gcp # The cloud to use (optional).
6
+ # Working directory (optional), synced to ~/sky_workdir on the remote cluster
7
+ # each time launch or exec is run with the yaml file.
8
+ #
9
+ # Commands in "setup" and "run" will be executed under it.
10
+ #
11
+ # If a .gitignore file (or a .git/info/exclude file) exists in the working
12
+ # directory, files and directories listed in it will be excluded from syncing.
13
+ workdir: ./gradio
14
+
15
+ setup: |
16
+ echo "Begin setup."
17
+ pip install -r requirements.txt
18
+ echo "Setup complete."
19
+
20
+ run: |
21
+ echo 'Starting gradio app...'
22
+ python app.py
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": true,
47
+ "cls_token": "<s>",
48
+ "do_lower_case": true,
49
+ "eos_token": "</s>",
50
+ "errors": "replace",
51
+ "mask_token": "<mask>",
52
+ "model_max_length": 512,
53
+ "pad_token": "<pad>",
54
+ "sep_token": "</s>",
55
+ "tokenizer_class": "RobertaTokenizer",
56
+ "trim_offsets": true,
57
+ "unk_token": "<unk>"
58
+ }
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff