NCTCMumbai
committed on
Commit
•
7f7b0cd
1
Parent(s):
c7c88ee
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
from rank_bm25 import BM25Okapi
|
5 |
+
from sentence_transformers import SentenceTransformer
|
6 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
7 |
+
gir='''The six General Rules of Interpretation (GIR) guide the classification of goods under the tariff. Rules 1 to 4 must be applied sequentially, while Rules 5 and 6 are standalone.
|
8 |
+
|
9 |
+
Rule 1: Titles and Terms
|
10 |
+
Titles: Section, Chapter, and sub-Chapter titles are for reference only.
|
11 |
+
Classification: Determine classification based on the headings, Section or Chapter Notes. Refer to GIRs 2 to 6 for further details.
|
12 |
+
Rule 2: Incomplete and Mixed Goods
|
13 |
+
Incomplete Goods (2a): Classification includes unfinished, incomplete, or disassembled items if they have the essential character of the finished product.
|
14 |
+
|
15 |
+
Mixed Goods (2b): A reference to a material includes mixtures or combinations with other materials. Goods with multiple materials are classified based on the principles of Rule 3.
|
16 |
+
|
17 |
+
Example: Dicalcium citrate, a compound with citric acid characteristics, is classified under 2918.15.90.19 as a salt of citric acid.
|
18 |
+
|
19 |
+
Rule 3: Classification Between Multiple Headings
|
20 |
+
Specific vs. General (3a): Prefer the heading with the most specific description if multiple headings apply.
|
21 |
+
|
22 |
+
Example: Mint tea is classified under tea as it provides a specific description, unlike mint alone.
|
23 |
+
|
24 |
+
Essential Character (3b): For mixtures or composite goods, classify based on the material or component giving the essential character.
|
25 |
+
|
26 |
+
Example: A liquor gift set is classified under the liquor heading because the liquor is the essential item.
|
27 |
+
|
28 |
+
Last Resort (3c): When essential character cannot be determined, classify under the last in numerical order among equally suitable headings.
|
29 |
+
|
30 |
+
Example: A gift set with socks and ties is classified under the tie heading as it comes last numerically.
|
31 |
+
|
32 |
+
Rule 4: Most Akin Goods
|
33 |
+
Last Resort: If goods cannot be classified using the above rules, classify them under the heading most akin to the goods.
|
34 |
+
Rule 5: Containers and Packing
|
35 |
+
Containers (5a): Containers specifically designed for an article and sold together with it are classified with the article unless the container defines the product’s essential character.
|
36 |
+
|
37 |
+
Example: Flute cases are classified with the flutes they contain.
|
38 |
+
|
39 |
+
Packing Materials (5b): Packing materials and containers are classified with the goods if they are not suitable for reuse and are of a kind normally used for packing.
|
40 |
+
|
41 |
+
Example: Styrofoam used for padding is classified with the goods it protects.
|
42 |
+
|
43 |
+
Rule 6: Subheadings
|
44 |
+
Classification: Classification at the subheading level follows the same rules as for headings, considering any related subheading notes and only comparing subheadings at the same level.'''
|
45 |
+
# Initialize the models
|
46 |
+
# Placeholder handed to bm25_bert_search, which builds a BM25Okapi index
# itself whenever the value it receives is None.
bm25 = None # BM25 will be initialized within the function
|
47 |
+
# Sentence-embedding model; bm25_bert_search calls its encode() on each query.
bert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
48 |
+
|
49 |
+
# Pre-compute BERT embeddings
|
50 |
+
def get_bert_embeddings(corpus, model):
    """Encode ``corpus`` with ``model`` and hand back whatever ``encode`` returns.

    Args:
        corpus: The documents to embed; passed to ``model.encode`` unchanged.
        model: Any object exposing an ``encode(corpus)`` method.

    Returns:
        The value produced by ``model.encode(corpus)``.
    """
    encoded = model.encode(corpus)
    return encoded
|
52 |
+
|
53 |
+
def load_embeddings(filename="/content/gdrive/MyDrive/hybrid search hs code/corpus_embeddings.npy"):
    """Read a saved NumPy array of corpus embeddings from ``filename``.

    Args:
        filename: Path of the ``.npy`` file to read.
            NOTE(review): the default looks like a Google Drive mount inside
            Colab -- confirm it exists wherever this script is deployed.

    Returns:
        The array returned by ``np.load(filename)``.
    """
    return np.load(filename)
|
56 |
+
|
57 |
+
# Perform BM25 and BERT search
|
58 |
+
def _top_hits(scores, score_column, descriptions, cth_codes, top_n):
    """Rank every row by ``scores`` (highest first) and keep the ``top_n`` best."""
    ranked = pd.DataFrame({
        'Index': range(len(scores)),
        score_column: scores,
        'Concat Description': descriptions,
        'CTH Code': cth_codes,
    })
    return ranked.sort_values(by=score_column, ascending=False).head(top_n)


def bm25_bert_search(query, df, corpus, bm25, bert_model, corpus_embeddings, top_n=10):
    """Return the merged BM25 and embedding-similarity hits for ``query``.

    Both rankers score every row of ``df``. The ``top_n`` best rows of each are
    concatenated (BM25 hits first) and de-duplicated on ``'CTH Code'``, keeping
    the first occurrence.

    Args:
        query: Free-text product description.
        df: DataFrame with ``'Concat Description'`` and ``'CTH Code'`` columns.
        corpus: Documents (strings) in the same order as the rows of ``df``.
            Only read when ``bm25`` is ``None``.
        bm25: Object exposing ``get_scores(tokens)``, or ``None`` to build a
            ``BM25Okapi`` index from ``corpus`` for this call.
        bert_model: Object exposing ``encode(text)``.
        corpus_embeddings: 2-D array-like with one embedding per row of ``df``.
        top_n: Number of hits taken from each ranker before merging.

    Returns:
        list[str]: One ``"Description: ..., CTH Code: ..."`` string per hit.

    Raises:
        ValueError: If the embeddings or the BM25 scores do not contain exactly
            one entry per row of ``df``.
    """
    row_count = len(df)
    if len(corpus_embeddings) != row_count:
        raise ValueError(
            f"corpus_embeddings has {len(corpus_embeddings)} rows but df has "
            f"{row_count}; the embeddings must be computed from the same table."
        )

    # BM25 search. The corpus is tokenized only when no index was supplied, so
    # callers that pass a prebuilt index do not pay for it on every query.
    if bm25 is None:
        bm25 = BM25Okapi([doc.split(" ") for doc in corpus])
    bm25_scores = bm25.get_scores(query.split(" "))
    if len(bm25_scores) != row_count:
        raise ValueError(
            f"BM25 produced {len(bm25_scores)} scores but df has {row_count} rows."
        )

    # Plain arrays keep the score/description/code columns aligned by position,
    # whatever index ``df`` happens to carry.
    descriptions = df['Concat Description'].to_numpy()
    cth_codes = df['CTH Code'].to_numpy()

    bm25_results = _top_hits(bm25_scores, 'BM25_Score', descriptions, cth_codes, top_n)

    # BERT search
    query_embedding = bert_model.encode(query)
    bert_scores = cosine_similarity([query_embedding], corpus_embeddings)[0]
    bert_results = _top_hits(bert_scores, 'BERT_Score', descriptions, cth_codes, top_n)

    # Combine BM25 and BERT results, then drop duplicates based on 'CTH Code'
    shared_columns = ['Index', 'Concat Description', 'CTH Code']
    combined_results = pd.concat(
        [bm25_results[shared_columns], bert_results[shared_columns]]
    ).drop_duplicates(subset=['CTH Code'])

    # Iterating the two columns directly avoids the per-row Series that
    # iterrows() builds (and the dtype coercion that comes with it).
    return [
        f"Description: {description}, CTH Code: {code}"
        for description, code in zip(
            combined_results['Concat Description'], combined_results['CTH Code']
        )
    ]
|
98 |
+
|
99 |
+
# Load precomputed embeddings
|
100 |
+
# Loaded with load_embeddings' default path; search_and_explain passes this
# array to bm25_bert_search.
corpus_embeddings = load_embeddings()
|
101 |
+
|
102 |
+
# Tariff table location (path kept exactly as the script has always used it).
_TARIFF_CSV_PATH = '/content/gdrive/MyDrive/hybrid search hs code/CTH_Description (2).csv'
_NO_API_OUTPUT = "No output received from the API."
_EMPTY_QUERY_MESSAGE = "Please enter a product description."
# Filled on first use with the keys 'df', 'corpus' and 'bm25'.
_search_assets = {}


def _load_search_assets():
    """Read the tariff table and build the BM25 index once; reuse them afterwards."""
    if not _search_assets:
        table = pd.read_csv(_TARIFF_CSV_PATH, on_bad_lines='skip')
        documents = table['Concat Description'].tolist()
        # Same single-space tokenization bm25_bert_search applies when it has
        # to build the index itself, so rankings stay unchanged.
        index = BM25Okapi([doc.split(" ") for doc in documents])
        # Publish everything in one step so a failure above leaves the cache empty.
        _search_assets.update(df=table, corpus=documents, bm25=index)
    return _search_assets


def _extract_reply(response):
    """Return the assistant text from a ``/model_chat`` response, or a fallback message."""
    try:
        reply = response[1][0][1]
    except (IndexError, KeyError, TypeError):
        return _NO_API_OUTPUT
    return reply if reply else _NO_API_OUTPUT


def search_and_explain(query):
    """Search the tariff table for ``query`` and ask the remote model to pick a CTH code.

    The candidate rows come from ``bm25_bert_search``. They are placed in a
    prompt together with the GIR text, and the prompt is sent to the
    ``Qwen/Qwen1.5-110B-Chat-demo`` Space through ``gradio_client``.

    Args:
        query: Product description typed by the user.

    Returns:
        str: The model's reply, ``"No output received from the API."`` when the
        response does not have the expected shape, or a short prompt to enter
        a description when ``query`` is blank.

    Side effects:
        Binds the module-level ``df`` to the loaded tariff table, as the
        original implementation did.
    """
    global df

    # A blank query would only waste a remote call.
    if not query or not query.strip():
        return _EMPTY_QUERY_MESSAGE

    assets = _load_search_assets()
    df = assets['df']
    # Honour a BM25 index installed in the module-level ``bm25``; otherwise use
    # the cached one so the index is not rebuilt on every query.
    index = bm25 if bm25 is not None else assets['bm25']

    # Perform the search and get results as strings
    result_strings = bm25_bert_search(
        query, df, assets['corpus'], index, bert_model, corpus_embeddings, top_n=10
    )

    # Prepare the prompt for the API
    prompt = (
        "Based on the descriptions:\n"
        + "\n".join(result_strings)
        + f"\nPlease choose the most suitable CTH code for the given product: '{query}'.Keep the GIR in mind while choosing'{gir}'.Explain the possibility of related CTH codes on certain conditions "
    )

    # Call the API
    from gradio_client import Client
    client = Client("Qwen/Qwen1.5-110B-Chat-demo")
    response = client.predict(
        query=prompt,
        history=[],
        system="You are a helpful assistant.",
        api_name="/model_chat"
    )

    # The original indexed into the fallback string here and raised IndexError;
    # the helper returns the fallback message instead.
    return _extract_reply(response)
|
133 |
+
|
134 |
+
# Create the Gradio interface
|
135 |
+
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Header text. The description names the retrieval methods this script
    # actually uses: BM25 keyword search plus sentence-embedding similarity,
    # followed by a language model that applies the GIR.
    gr.Markdown(
        "# AI-powered Indian Customs Tariff Search with Explainability\n\n"
        "This app combines keyword (BM25) and semantic (sentence-embedding) "
        "search over the Indian customs tariff, then asks a large language "
        "model to choose the best matching CTH code in light of the General "
        "Rules of Interpretation (GIR)."
    )

    query_input = gr.Textbox(label="Enter Product Description", placeholder="e.g., fuel pump for elevator")
    result_output = gr.Textbox(label="HSN Prediction with Explanation", lines=10)

    # Submitting the query box (pressing Enter) runs the search and fills the result box.
    query_input.submit(search_and_explain, inputs=query_input, outputs=result_output)

demo.launch(debug=True)
|