stanslausmwongela committed on
Commit
57373b0
1 Parent(s): b0ed1a9

Updated Cleaning Text Function

Browse files
Files changed (1) hide show
  1. predict.py +36 -24
predict.py CHANGED
@@ -10,38 +10,50 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer
10
  from torch.utils.data import TensorDataset, DataLoader
11
 
12
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  class Preprocess:
14
  def __init__(self, tokenizer_vocab_path, tokenizer_max_len):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_vocab_path,
16
  use_auth_token='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')
17
  self.max_len = tokenizer_max_len
18
 
19
  def clean_text(self, text):
20
  text = text.lower()
21
- stopwords = ["i", "was", "transferred",
22
- "from", "to", "nilienda", "kituo",
23
- "cha", "lakini", "saa", "hii", "niko",
24
- "at", "nilienda", "nikahudumiwa", "pole",
25
- "deliver", "na", "ni", "baada", "ya",
26
- "kutumwa", "kutoka", "nilienda",
27
- "ndipo", "nikapewa", "hiyo", "lindam ama", "nikawa",
28
- "mgonjwa", "nikatibiwa", "in", "had", "a",
29
- "visit", "gynaecologist", "ndio",
30
- "karibu", "mimi", "niko", "sehemu", "hospitali",
31
- "serikali", "delivered", "katika", "kaunti", "kujifungua",
32
- "katika", "huko", "nilipoenda", "kwa", "bado", "naedelea",
33
- "sija", "maliza", "mwisho",
34
- "nilianza", "kliniki", "yangu",
35
- "nilianzia", "nilijifungua"]
36
- text_single = ' '.join(word for word in text.split() if word not in stopwords)
37
- return text_single
38
-
39
- def encode_fn(self, text_single):
40
  """
41
  Using tokenizer to preprocess the text
42
  example of text_single:'Nairobi Hospital'
43
  """
44
- tokenizer = self.tokenizer(text_single,
45
  padding=True,
46
  truncation=True,
47
  max_length=self.max_len,
@@ -51,15 +63,15 @@ class Preprocess:
51
  attention_mask = tokenizer['attention_mask']
52
  return input_ids, attention_mask
53
 
54
- def process_tokenizer(self, text_single):
55
  """
56
  Preprocess text and prepare dataloader for a single new sentence
57
  """
58
- input_ids, attention_mask = self.encode_fn(text_single)
 
59
  data = TensorDataset(input_ids, attention_mask)
60
  return data
61
 
62
-
63
  class Facility_Model:
64
  def __init__(self, facility_model_path: any,
65
  max_len: int):
@@ -107,7 +119,7 @@ class Facility_Model:
107
  """
108
  output_dict = {}
109
  # transform the relation table(between label and intent)
110
- path_table = pd.read_csv('dhis_label_relation_14357.csv')
111
 
112
  label_intent_dict = path_table[["label", "corresponding_label"]].set_index("corresponding_label").to_dict()[
113
  'label']
 
10
  from torch.utils.data import TensorDataset, DataLoader
11
 
12
 
13
+ import os
14
+ import random
15
+ import json
16
+ import numpy as np
17
+ import torch
18
+ import heapq
19
+ import pandas as pd
20
+ from tqdm import tqdm
21
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
22
+ from torch.utils.data import TensorDataset, DataLoader
23
+
24
+
25
  class Preprocess:
26
  def __init__(self, tokenizer_vocab_path, tokenizer_max_len):
27
+ self.stopwords = ["i", "was", "transferred",
28
+ "from", "to", "nilienda", "kituo",
29
+ "cha", "lakini", "saa", "hii", "niko",
30
+ "at", "nilienda", "nikahudumiwa", "pole",
31
+ "deliver", "na", "ni", "baada", "ya",
32
+ "kutumwa", "kutoka", "nilienda",
33
+ "ndipo", "nikapewa", "hiyo", "lindam ama", "nikawa",
34
+ "mgonjwa", "nikatibiwa", "in", "had", "a",
35
+ "visit", "gynaecologist", "ndio",
36
+ "karibu", "mimi", "niko", "sehemu", "hospitali",
37
+ "serikali", "delivered", "katika", "kaunti", "kujifungua",
38
+ "katika", "huko", "nilipoenda", "kwa", "bado", "naedelea",
39
+ "sija", "maliza", "mwisho",
40
+ "nilianza", "kliniki", "yangu",
41
+ "nilianzia", "nilijifungua"]
42
  self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_vocab_path,
43
  use_auth_token='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')
44
  self.max_len = tokenizer_max_len
45
 
46
  def clean_text(self, text):
47
  text = text.lower()
48
+ self.text_single = ' '.join(word for word in text.split() if word not in self.stopwords)
49
+ return self.text_single
50
+
51
+ def encode_fn(self):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  """
53
  Using tokenizer to preprocess the text
54
  example of text_single:'Nairobi Hospital'
55
  """
56
+ tokenizer = self.tokenizer(self.text_single,
57
  padding=True,
58
  truncation=True,
59
  max_length=self.max_len,
 
63
  attention_mask = tokenizer['attention_mask']
64
  return input_ids, attention_mask
65
 
66
+ def process_tokenizer(self, data):
67
  """
68
  Preprocess text and prepare dataloader for a single new sentence
69
  """
70
+ self.clean_text(data)
71
+ input_ids, attention_mask = self.encode_fn()
72
  data = TensorDataset(input_ids, attention_mask)
73
  return data
74
 
 
75
  class Facility_Model:
76
  def __init__(self, facility_model_path: any,
77
  max_len: int):
 
119
  """
120
  output_dict = {}
121
  # transform the relation table(between label and intent)
122
+ path_table = pd.read_csv('/content/drive/MyDrive/dhis14000/dhis_label_relation_14357.csv')
123
 
124
  label_intent_dict = path_table[["label", "corresponding_label"]].set_index("corresponding_label").to_dict()[
125
  'label']