"""Gradio demo that suggests Customs Tariff Heading (CTH) codes for a product description.

At import time the script loads a saved Keras model, three CSV lookup tables
and a BERT tokenizer, then launches a Gradio interface whose handler is
``predict_CTH``.
"""
import math
import random
import sys
from datetime import datetime

import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

# 'models' must be on sys.path before the `official` package is imported
# (presumably a local checkout of the TensorFlow Model Garden - confirm).
sys.path.append('models')
from official.nlp.data import classifier_data_lib
from official.nlp.bert import tokenization
from official.nlp import optimization

tf.get_logger().setLevel('ERROR')

import gradio as gr

# NOTE(review): the device_count key is lower-case 'cpu'; confirm this has the
# intended effect. The session object is never used again in this file.
config = tf.compat.v1.ConfigProto(device_count={'cpu': 0})
sess = tf.compat.v1.Session(config=config)

# The optimizer is only built so that the saved model can be deserialised
# (it is handed to load_model as the 'AdamWeightDecay' custom object).
num_warmup_steps = 1
num_train_steps = 1
init_lr = 3e-5
optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

### Load Model
checkpoint_filepath = r'./Checkpoint'
model = tf.keras.models.load_model(
    checkpoint_filepath,
    custom_objects={'KerasLayer': hub.KerasLayer, 'AdamWeightDecay': optimizer},
)

# Lookup tables. CTH codes are normalised to 8-character, zero-padded strings
# so the three tables can be matched against each other.
df_report = pd.read_csv('./CTH_Description.csv')
df_report['CTH Code'] = df_report['CTH Code'].astype(str).str.zfill(8)

df_report_DUTY = pd.read_csv('./CTH_WISE_DUTY_RATE.csv')
df_report_DUTY['CTH'] = df_report_DUTY['CTH'].astype(str).str.zfill(8)

df = pd.read_csv("./CTH_CODE_MAP.csv")
df['CTH'] = df['CTH'].astype(str).str.zfill(8)
df = df[['CTH', 'code']]

class_names = (
    df[['CTH', 'code']]
    .drop_duplicates(subset='CTH')
    .sort_values(by='code', ignore_index=True)['CTH']
    .values.tolist()
)
label_list = list(range(0, len(class_names)))

max_seq_length = 200  # maximum length of (token) input sequences; it can be any number
train_batch_size = 32  # batch size (not used for inference in this file)

# If the best class probability is at or below this value, the description is
# reported as inadequate instead of returning predictions.
MIN_TOP_PROBABILITY = 0.000131

# Get BERT layer and tokenizer:
# More details here: https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
    trainable=True,
)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)


def to_feature(text, label, label_list=label_list, max_seq_length=max_seq_length, tokenizer=tokenizer):
    """Convert one (text, label) pair of eager tensors into BERT input features.

    Returns:
        A tuple ``(input_ids, input_mask, segment_ids, label_id)`` taken from
        the feature built by ``classifier_data_lib.convert_single_example``.
    """
    example = classifier_data_lib.InputExample(
        guid=None,
        text_a=text.numpy(),
        text_b=None,
        label=label.numpy(),
    )
    feature = classifier_data_lib.convert_single_example(
        0, example, label_list, max_seq_length, tokenizer
    )
    return (feature.input_ids, feature.input_mask, feature.segment_ids, feature.label_id)


def to_feature_map(text, label):
    """Dataset ``map`` function: wrap ``to_feature`` and name the model inputs.

    ``tf.py_function`` loses static shape information, so the shapes are
    restored explicitly before the tensors are packed into the input dict.

    Returns:
        ``(x, label_id)`` where ``x`` has the keys ``input_word_ids``,
        ``input_mask`` and ``input_type_ids``.
    """
    input_ids, input_mask, segment_ids, label_id = tf.py_function(
        to_feature,
        inp=[text, label],
        Tout=[tf.int32, tf.int32, tf.int32, tf.int32],
    )
    input_ids.set_shape([max_seq_length])
    input_mask.set_shape([max_seq_length])
    segment_ids.set_shape([max_seq_length])
    label_id.set_shape([])
    x = {
        "input_word_ids": input_ids,
        "input_mask": input_mask,
        "input_type_ids": segment_ids,
    }
    return (x, label_id)


def print3largest(arr, arr_size):
    """Return ``[first, second, third]``: the three largest of ``arr[:arr_size]``.

    Slots that cannot be filled (fewer than three elements) keep the
    placeholder ``-sys.maxsize``. Kept for backward compatibility;
    ``predict_CTH`` ranks by index instead so that tied values are not
    resolved to the same class twice.
    """
    third = first = second = -sys.maxsize
    for i in range(0, arr_size):
        value = arr[i]
        if value > first:
            first, second, third = value, first, second
        elif value > second:
            second, third = value, second
        elif value > third:
            third = value
    return [first, second, third]


def count_special_character(string):
    """Return True if ``string`` contains at least one alphabetic character.

    Despite the name, nothing is counted: the result is False only when every
    character is non-alphabetic (which includes the empty string).
    """
    return any(ch.isalpha() for ch in string)


def _first_match(frame, key_column, key, value_column, default=''):
    """Return ``value_column`` of the first row where ``key_column == key``, else ``default``."""
    try:
        return frame[frame[key_column] == key][value_column].iloc[0]
    except (IndexError, KeyError):
        # No matching row (or the column is absent): best-effort lookup.
        return default


def predict_CTH(txt):
    """Predict the three most likely CTH codes for a product description.

    Args:
        txt: Free-text product description entered by the user.

    Returns:
        A ``{label: confidence}`` dict for the Gradio "label" output. On
        success it holds, for each of the top classes, one "CTH: ... Duty
        Rate(%): ..." entry and one "Desc: ..." entry, both carrying that
        class's probability renormalised over the returned classes. If the
        input is empty, shorter than 3 characters or has no letters, or if the
        best probability is too low, a single explanatory entry with
        confidence 1.0 is returned instead.
    """
    print('Desc: ', txt)
    if not txt or len(txt) < 3 or not count_special_character(txt):
        return {'Enter Correct Description': float(1.0)}

    # The label (1) is a dummy value required by the feature pipeline; it is
    # not used when predicting.
    valid_data = tf.data.Dataset.from_tensor_slices(([txt], [1]))
    valid_data = valid_data.map(to_feature_map).batch(1)
    preds = model.predict(valid_data)
    probabilities = tf.nn.softmax(preds).numpy()[0]

    now = datetime.now()
    print("Time =", now)

    # Rank by class index rather than by value so tied probabilities map to
    # distinct classes; the stable sort keeps the lowest index first on ties.
    top_codes = np.argsort(-probabilities, kind='stable')[:3]
    top_probs = [float(probabilities[code]) for code in top_codes]

    if top_probs[0] <= MIN_TOP_PROBABILITY:
        return {'Not a adequate description': float(1.0)}

    total = sum(top_probs)
    result = {}
    for predicted_code, probability in zip(top_codes, top_probs):
        pred_CTH = df[df['code'] == predicted_code]['CTH'].iloc[0]
        pred_duty = _first_match(df_report_DUTY, 'CTH', str(pred_CTH), 'DUTY_RATE')
        pred_desc = _first_match(df_report, 'CTH Code', str(pred_CTH), 'Concat Description')
        share = float(probability / total)
        result['CTH: ' + str(pred_CTH) + ' Duty Rate(%): ' + str(pred_duty)] = share
        result['Desc: ' + str(pred_desc)] = share
    return result


input_txt = gr.Textbox(
    label='Enter Your Product Descrption',
    lines=3,
)

# NOTE(review): the description/title literals were reconstructed from a
# whitespace-mangled source; any markup that was lost should be restored.
description = (
    "\n"
    "AdvaitBERT is modified version of BERT (Bidirectional Encoder Representation for Transformers), "
    "finetuned on the Text corpus of Indian Customs Declarations. It is trained for performing "
    "downstream tasks like automating the tariff classification and validation process of Customs "
    "declarations in realtime. This model may help Customs administration to efficiently use AI assisted "
    "NLP in realtime Customs process like Assessment, Post Clearance Audit, thereby highlighting classification "
    "inconsistencies and help in revenue augmentation."
    "\n"
)
title = "Powered by NCTC\n"

# NOTE(review): `article` was passed to gr.Interface but never defined in the
# visible source (a NameError at startup). None is assumed to match Gradio's
# default; restore the intended text if there was one.
article = None

gr.Interface(
    predict_CTH,
    inputs=input_txt,
    outputs="label",
    interpretation="default",
    description=description,
    examples=['200 SI/SI/SI LPO ALUMINIUM LIDS (QTY: 8820000 PCS/PRICE: 21.'],
    title=title,
    article=article,
).launch()