import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import sys
import random
import os
sys.path.append('models')
from official.nlp.data import classifier_data_lib
from official.nlp.bert import tokenization
from official.nlp import optimization
tf.get_logger().setLevel('ERROR')
import math
from datetime import datetime
import gradio as gr
# TF1-style session config; note that `sess` is never referenced again below.
# (device_count keys are normally uppercase, e.g. 'CPU'; the lowercase key is kept as-is here.)
config = tf.compat.v1.ConfigProto(
    device_count={'cpu': 0}
)
sess = tf.compat.v1.Session(config=config)
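# The optimizer below appears to be rebuilt only so that tf.keras can deserialize the
# AdamWeightDecay object via custom_objects when loading the checkpoint; the
# one-step warmup/train schedule is a placeholder, not a real training schedule.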
num_warmup_steps = 1
num_train_steps = 1
init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')
### Load Model
checkpoint_filepath = r'./Checkpoint'
model = tf.keras.models.load_model(checkpoint_filepath,
                                   custom_objects={'KerasLayer': hub.KerasLayer,
                                                   'AdamWeightDecay': optimizer})
# Reference tables: CTH descriptions, CTH-wise duty rates, and the CTH <-> integer code map.
# zfill(8) restores the leading zeros of the 8-digit CTH codes lost during CSV parsing.
df_report = pd.read_csv('./CTH_Description.csv')
df_report['CTH Code'] = df_report['CTH Code'].astype(str).str.zfill(8)
df_report_DUTY = pd.read_csv('./CTH_WISE_DUTY_RATE.csv')
df_report_DUTY['CTH'] = df_report_DUTY['CTH'].astype(str).str.zfill(8)
df = pd.read_csv("./CTH_CODE_MAP.csv")
df['CTH'] = df['CTH'].astype(str).str.zfill(8)
df = df[['CTH', 'code']]
class_names = (df[['CTH', 'code']]
               .drop_duplicates(subset='CTH')
               .sort_values(by='code', ignore_index=True)['CTH']
               .values.tolist())
label_list = list(range(len(class_names)))
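# class_names is ordered by the integer 'code' column, so index i corresponds to the
# CTH mapped to code i (assuming the codes run contiguously from 0 to N-1, which the
# range-based label_list suggests); label_list is simply [0, 1, ..., N-1].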
max_seq_length = 200   # maximum length of (token) input sequences
train_batch_size = 32  # batch size (32 chosen to avoid out-of-memory errors); unused at inference time
# Get BERT layer and tokenizer:
# More details here: https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4", trainable=True)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)
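# Quick sanity check (illustrative; actual word-piece splits and ids depend on the vocab):
#   tokenizer.tokenize("aluminium lids")        -> e.g. ['aluminium', 'lids']
#   tokenizer.convert_tokens_to_ids(['[CLS]'])  -> e.g. [101]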
# Converts a single (text, label) row into the input features BERT expects.
def to_feature(text, label, label_list=label_list, max_seq_length=max_seq_length, tokenizer=tokenizer):
    example = classifier_data_lib.InputExample(guid=None,
                                               text_a=text.numpy(),
                                               text_b=None,
                                               label=label.numpy())
    feature = classifier_data_lib.convert_single_example(0, example, label_list, max_seq_length, tokenizer)
    return (feature.input_ids, feature.input_mask, feature.segment_ids, feature.label_id)
def to_feature_map(text, label):
    input_ids, input_mask, segment_ids, label_id = tf.py_function(to_feature, inp=[text, label],
                                                                  Tout=[tf.int32, tf.int32, tf.int32, tf.int32])
    input_ids.set_shape([max_seq_length])
    input_mask.set_shape([max_seq_length])
    segment_ids.set_shape([max_seq_length])
    label_id.set_shape([])
    x = {
        "input_word_ids": input_ids,
        "input_mask": input_mask,
        "input_type_ids": segment_ids
    }
    return (x, label_id)
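# The explicit set_shape calls above are needed because tf.py_function erases static
# shape information: each feature tensor is (max_seq_length,) and label_id is a scalar.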
def print3largest(arr, arr_size):
    # Despite the name, this returns (rather than prints) the three largest
    # values in arr, in descending order.
    third = first = second = -sys.maxsize
    for i in range(0, arr_size):
        if arr[i] > first:
            third = second
            second = first
            first = arr[i]
        elif arr[i] > second:
            third = second
            second = arr[i]
        elif arr[i] > third:
            third = arr[i]
    pred_value_max_three = [first, second, third]
    return pred_value_max_three
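# Example (illustrative):
#   print3largest([0.1, 0.7, 0.05, 0.15], 4)  -> [0.7, 0.15, 0.1]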
def count_special_character(string):
    # Returns True if the string contains at least one alphabetic character,
    # False if every character is non-alphabetic.
    special_char = 0
    for i in range(len(string)):
        if string[i].isalpha():
            continue
        else:
            special_char += 1
    if len(string) == special_char:
        return False
    else:
        return True
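# Examples (illustrative):
#   count_special_character('123/!!')  -> False  (no alphabetic characters)
#   count_special_character('lids 2')  -> True   (contains letters)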
def predict_CTH(txt):
    print('Desc: ', txt)
    if (txt != '') and len(txt) >= 3 and count_special_character(txt):
        # The label value 1 is a dummy placeholder; to_feature_map expects a (text, label) pair.
        valid_data = tf.data.Dataset.from_tensor_slices(([txt], [1]))
        valid_data = (valid_data.map(to_feature_map).batch(1))
        preds = model.predict(valid_data)
        predicted_values = tf.nn.softmax(preds)
        arr = predicted_values.numpy().tolist()[0]
        n = len(arr)
        pred_value_max_three = print3largest(arr, n)
        now = datetime.now()
        print("Time =", now)
        sum_all = pred_value_max_three[0] + pred_value_max_three[1] + pred_value_max_three[2]
        val_1 = pred_value_max_three[0] / sum_all
        val_2 = pred_value_max_three[1] / sum_all
        val_3 = pred_value_max_three[2] / sum_all
        if pred_value_max_three[0] <= 0.000131:
            return {'Not an adequate description': float(1.0)}
        else:
            Var_CTH = []
            Var_desc = []
            Var_duty = []
            for i in pred_value_max_three:
                predicted_code = np.where(predicted_values.numpy() == i)[1][0]
                pred_CTH = df[df['code'] == predicted_code]['CTH'].iloc[0]
                try:
                    pred_duty = df_report_DUTY[df_report_DUTY['CTH'] == str(pred_CTH)]['DUTY_RATE'].iloc[0]
                except Exception:
                    pred_duty = ''
                try:
                    pred_desc = df_report[df_report['CTH Code'] == str(pred_CTH)]['Concat Description'].iloc[0]
                except Exception:
                    pred_desc = ''
                Var_CTH.append(pred_CTH)
                Var_desc.append(pred_desc)
                Var_duty.append(pred_duty)
            P1 = 'CTH: ' + str(Var_CTH[0]) + ' Duty Rate(%): ' + str(Var_duty[0])
            P2 = 'CTH: ' + str(Var_CTH[1]) + ' Duty Rate(%): ' + str(Var_duty[1])
            P3 = 'CTH: ' + str(Var_CTH[2]) + ' Duty Rate(%): ' + str(Var_duty[2])
            Q1 = 'Desc: ' + str(Var_desc[0])
            Q2 = 'Desc: ' + str(Var_desc[1])
            Q3 = 'Desc: ' + str(Var_desc[2])
            return {str(P1): float(val_1), str(Q1): float(val_1),
                    str(P2): float(val_2), str(Q2): float(val_2),
                    str(P3): float(val_3), str(Q3): float(val_3)}
    else:
        return {'Enter a valid description': float(1.0)}
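# predict_CTH returns a {label: confidence} dict; Gradio's "label" output
# component renders it as a ranked list of predictions.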
input_txt = gr.Textbox(
    label='Enter Your Product Description',
    lines=3,
)
description="<p style='color:blue;text-align:justify;font-size:1vw;'>AdvaitBERT is modified version of BERT (Bidirectional Encoder Representation for Transformers), \
finetuned on the Text corpus of Indian Customs Declarations. It is trained for performing \
downstream tasks like automating the tariff classification and validation process of Customs \
declarations in realtime. This model may help Customs administration to efficiently use AI assisted \
NLP in realtime Customs process like Assessment, Post Clearance Audit, thereby highlighting classification \
inconsistencies and help in revenue augmentation.</a></p>"
title="<h1 style='color:green;text-align:center;font-size:2vw;'>AdvaitBERT </a></h1>"
article="<p style='color:black;text-align:right;font-size:1vw;'>Powered by NCTC </a></p>"
#css=".gradio-container {background-color: papayawhip}",
path_2 = './CTH_CODE_MAP.csv'
# Resolve the relative paths to absolute ones.
absolute_path_1 = os.path.abspath(checkpoint_filepath)
absolute_path_2 = os.path.abspath(path_2)
print("Absolute path:", absolute_path_1)
blocked_files = [absolute_path_1, absolute_path_2]
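# Passing these to launch(blocked_paths=...) stops Gradio from serving the model
# checkpoint and the raw code map to clients via the app's file routes.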
gr.Interface(
    predict_CTH,
    inputs=input_txt,
    outputs="label",
    interpretation="default",
    description=description,
    #live=True,
    examples=['200 SI/SI/SI LPO ALUMINIUM LIDS (QTY: 8820000 PCS/PRICE: 21.'],
    title=title,
    article=article,
).launch(debug=True, blocked_paths=blocked_files)