File size: 4,604 Bytes
147e44c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import json

from allennlp.data.tokenizers.pretrained_transformer_tokenizer import PretrainedTransformerTokenizer
from allennlp.data.token_indexers.single_id_token_indexer import SingleIdTokenIndexer
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.text_field_embedders.basic_text_field_embedder import BasicTextFieldEmbedder
from allennlp.modules.token_embedders.embedding import Embedding
from allennlp.modules.seq2vec_encoders.cnn_encoder import CnnEncoder
from allennlp.models.archival import archive_model, load_archive
from allennlp_models.rc.modules.seq2seq_encoders.stacked_self_attention import StackedSelfAttentionEncoder
from allennlp.predictors.predictor import Predictor
from allennlp.predictors.text_classifier import TextClassifierPredictor
import gradio as gr
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

from project_settings import project_path
from toolbox.allennlp_models.text_classifier.models.hierarchical_text_classifier import HierarchicalClassifier
from toolbox.allennlp_models.text_classifier.dataset_readers.hierarchical_classification_json import HierarchicalClassificationJsonReader


def get_args():
    """Build the command-line parser and return the parsed arguments.

    Options cover the input Excel sheet, the trained model archive, the
    predictor to instantiate, how many top predictions to keep, and the
    JSONL output path.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--excel_file",
        type=str,
        default=r"D:\Users\tianx\PycharmProjects\telemarketing_intent\data\excel\telemarketing_intent_vi.xlsx",
    )
    arg_parser.add_argument(
        "--archive_file",
        type=str,
        default=(project_path / "trained_models/telemarketing_intent_classification_vi").as_posix(),
    )
    arg_parser.add_argument("--predictor_name", type=str, default="text_classifier")
    arg_parser.add_argument("--top_k", type=int, default=10)
    arg_parser.add_argument("--output_file", type=str, default="intent_top_k.jsonl")
    return arg_parser.parse_args()


def _none_if_nan(value):
    """Normalize a pandas cell to plain ``None`` when it is NaN/None."""
    if pd.isna(value):
        return None
    return value


def _int_or_none(value):
    """Coerce a pandas cell to ``int``; NaN/None or unconvertible → ``None``."""
    if pd.isna(value):
        return None
    try:
        return int(value)
    except Exception:
        # Preserve the original best-effort behavior: log the surprising
        # type and drop the value rather than aborting the whole export.
        print(type(value))
        return None


def main():
    """Predict intents for each row of an Excel sheet and write JSONL output.

    For every row with non-empty text, runs the archived text classifier,
    takes the ``--top_k`` most probable labels, and writes one JSON object
    per row (original columns plus ``predict_label_top_k`` /
    ``predict_prob_top_k``, both ordered most-probable first).
    """
    args = get_args()

    archive = load_archive(archive_file=args.archive_file)
    predictor = Predictor.from_archive(archive, predictor_name=args.predictor_name)

    df = pd.read_excel(args.excel_file)

    # NOTE(review): hard-coded resume offset, presumably from a previously
    # interrupted run — rows before this index are skipped. Confirm it is
    # still wanted before re-running from scratch.
    resume_index = 26976

    with open(args.output_file, "w", encoding="utf-8") as f:
        for i, row in tqdm(df.iterrows(), total=len(df)):
            if i < resume_index:
                continue

            text = row["text"]
            if pd.isna(text):
                # A row without text cannot be classified; skip it entirely.
                continue
            text = str(text)

            source = _none_if_nan(row["source"])
            label0 = _none_if_nan(row["label0"])
            label1 = _none_if_nan(row["label1"])
            selected = _int_or_none(row["selected"])
            checked = _int_or_none(row["checked"])

            outputs = predictor.predict_json({'sentence': text})
            probs = outputs["probs"]

            # Indices of the top-k probabilities, ascending by probability.
            # BUG FIX: the original hard-coded 10 here, ignoring --top_k.
            arg_idx_top_k = np.argsort(probs)[-args.top_k:]
            label_top_k = [
                predictor._model.vocab.get_token_from_index(index=idx, namespace="labels").split("_")[-1]
                for idx in arg_idx_top_k
            ]
            prob_top_k = [str(round(probs[idx], 5)) for idx in arg_idx_top_k]

            row_ = {
                "source": source,
                "text": text,
                "label0": label0,
                "label1": label1,
                "selected": selected,
                "checked": checked,
                # Reversed so the most probable label comes first.
                "predict_label_top_k": ";".join(reversed(label_top_k)),
                "predict_prob_top_k": ";".join(reversed(prob_top_k)),
            }
            f.write("{}\n".format(json.dumps(row_, ensure_ascii=False)))

    return


# Script entry point: run the batch prediction when executed directly.
if __name__ == '__main__':
    main()