update code

Changed files:
- parameters.py (+4 -3)
- siameser.py (+1 -13)
- utils.py (+19 -3)
parameters.py CHANGED

@@ -1,7 +1,8 @@
 # transformer model
 embedding_model = 'CaoHaiNam/vietnamese-address-embedding'
-local_embedding_model = 'embedding-model'
-
 
 NORM_ADDS_FILE_ALL_1 = 'data/standard_address_all_1.json'
-STD_EMBEDDING_FILE_ALL_1 = 'data/address_matrix_all_1.pt'
+STD_EMBEDDING_FILE_ALL_1 = 'data/address_matrix_all_1.pt'
+
+LOG_DIRECTORY = 'logs'
+LOG_RESULT_FILE = 'logs.json'
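The two new entries, LOG_DIRECTORY and LOG_RESULT_FILE, feed the save_result helper introduced in utils.py below. A minimal sketch of how they might be combined into a single log path; this wiring is an assumption, not part of the commit:

import os

import parameters

# Assumed usage: join the new settings into one path, e.g. 'logs/logs.json'.
log_path = os.path.join(parameters.LOG_DIRECTORY, parameters.LOG_RESULT_FILE)
os.makedirs(parameters.LOG_DIRECTORY, exist_ok=True)  # make sure the directory exists before writing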
siameser.py CHANGED

@@ -13,14 +13,8 @@ device = torch.device('cpu')
 
 class Siameser:
     def __init__(self, model_name=None, stadard_scope=None):
-        # print('Load model')
         print("Load sentence embedding model (If this is the first time you run this repo, It could be take time to download sentence embedding model)")
         self.threshold = 0.61
-        # if os.path.isdir(parameters.local_embedding_model):
-        #     self.embedding_model = SentenceTransformer(parameters.local_embedding_model).to(device)
-        # else:
-        #     self.embedding_model = SentenceTransformer(parameters.embedding_model).to(device)
-        #     self.embedding_model.save(parameters.local_embedding_model)
         self.embedding_model = SentenceTransformer(parameters.embedding_model).to(device)
 
         if stadard_scope == 'all':
@@ -55,10 +49,8 @@ class Siameser:
         else:
             score = F.cosine_similarity(raw_add_vectors, self.std_embeddings)
             s, top_k = score.topk(1)
-
-        # return
+
         s, idx = s.tolist()[0], top_k.tolist()[0]
-        # if s < 0.57:
         if s < self.threshold:
             return {'Format Error': 'Xâu truyền vào không phải địa chỉ, mời nhập lại.'}
         std_add = self.NORM_ADDS[str(idx)]
@@ -75,8 +67,6 @@ class Siameser:
         score = F.cosine_similarity(raw_add_vectors, self.std_embeddings)
         s, top_k = score.topk(k)
         s, top_k = s.tolist(), top_k.tolist()
-        # print(s, top_k)
-        # return
 
         if s[0] < self.threshold:
             return {'Format Error': 'Dường như xâu truyền vào không phải địa chỉ, mời nhập lại.'}, {}
@@ -86,6 +76,4 @@ class Siameser:
             std_add = self.NORM_ADDS[str(idx)]
             top_std_adds.append(utils.get_full_result(raw_add_, std_add, round(score, 4)))
 
-        x1, x2 = top_std_adds[0], top_std_adds[1]
-
         return top_std_adds[0], top_std_adds
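The removed lines are dead code (commented-out local-model caching, debug prints, and the unused x1, x2 unpacking), so behaviour is unchanged. The two 'Format Error' messages are Vietnamese for, roughly, "The input string does not look like an address, please re-enter." The matching itself keeps the cosine-similarity / top-k / threshold pattern visible in the hunks above; a self-contained sketch of that pattern, with query_vec and candidate_vecs standing in for the real embeddings (illustrative names, not from the repo):

import torch
import torch.nn.functional as F

# Stand-ins for one query embedding and the matrix of standardized-address
# embeddings (raw_add_vectors and self.std_embeddings in the diff above).
query_vec = torch.randn(1, 768)
candidate_vecs = torch.randn(1000, 768)

threshold = 0.61                                         # same cut-off as self.threshold
score = F.cosine_similarity(query_vec, candidate_vecs)   # one similarity per candidate
s, top_k = score.topk(1)                                  # best score and its index
s, idx = s.tolist()[0], top_k.tolist()[0]
if s < threshold:
    print('Input does not look like an address')          # mirrors the Format Error branch
else:
    print(f'Best match: index {idx}, similarity {s:.4f}')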
utils.py CHANGED

@@ -1,6 +1,9 @@
 # import numpy as np
 import re
 import string
+import json
+from datetime import datetime
+from typing import Text, Dict
 
 # delete tone and lower
 anphabet = ['a', 'ă', 'â', 'b', 'c', 'd',
@@ -39,10 +42,9 @@ def remove_accent(text):
 # remove functuation
 def remove_punctuation(text):
 
-    punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
     whitespace = ' '
     for i in text:
-        if i in punctuation:
+        if i in string.punctuation:
             text = text.replace(i, whitespace)
     return ' '.join(text.split())
 
@@ -95,4 +97,18 @@ def get_full_result(raw_address, std_address, score):
     full_result['detail_address'] = get_detail_address(raw_address, std_address)
     full_result['main_address'] = std_address
     full_result['similarity_score'] = score
-    return full_result
+    return full_result
+
+
+def save_result(file_path: Text, result: Dict) -> None:
+    log_sample = dict()
+    log_sample['result'] = result
+    log_sample['created_at'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    logs = json.load(open(file_path, "r", encoding="utf8"))
+    logs.append(log_sample)
+    json.dump(
+        logs,
+        open(file_path, "w", encoding="utf8"),
+        ensure_ascii=False,
+        indent=4
+    )
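As committed, save_result assumes file_path already exists and contains a JSON list; json.load will raise FileNotFoundError (or JSONDecodeError on an empty file) otherwise, and the file handles are left to the garbage collector rather than closed explicitly. A minimal usage sketch under that assumption, with a one-time initialization step; the path wiring through parameters is assumed, not shown in this commit:

import json
import os

import parameters
import utils

log_path = os.path.join(parameters.LOG_DIRECTORY, parameters.LOG_RESULT_FILE)

# One-time setup: save_result expects an existing file holding a JSON list.
os.makedirs(parameters.LOG_DIRECTORY, exist_ok=True)
if not os.path.exists(log_path):
    with open(log_path, "w", encoding="utf8") as f:
        json.dump([], f)

# Append one result entry; save_result timestamps it and rewrites the whole file.
utils.save_result(log_path, {"main_address": "Hà Nội", "similarity_score": 0.95})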