NCTCMumbai committed on
Commit
7f7b0cd
1 Parent(s): c7c88ee

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +148 -0
app.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ from rank_bm25 import BM25Okapi
5
+ from sentence_transformers import SentenceTransformer
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
# Full text of the six General Rules of Interpretation (GIR) for tariff
# classification.  This constant is interpolated verbatim into the LLM prompt
# in search_and_explain(), so the model can ground its CTH-code choice in the
# rules.  Do not edit the wording casually — it is runtime prompt text.
gir='''The six General Rules of Interpretation (GIR) guide the classification of goods under the tariff. Rules 1 to 4 must be applied sequentially, while Rules 5 and 6 are standalone.

Rule 1: Titles and Terms
Titles: Section, Chapter, and sub-Chapter titles are for reference only.
Classification: Determine classification based on the headings, Section or Chapter Notes. Refer to GIRs 2 to 6 for further details.
Rule 2: Incomplete and Mixed Goods
Incomplete Goods (2a): Classification includes unfinished, incomplete, or disassembled items if they have the essential character of the finished product.

Mixed Goods (2b): A reference to a material includes mixtures or combinations with other materials. Goods with multiple materials are classified based on the principles of Rule 3.

Example: Dicalcium citrate, a compound with citric acid characteristics, is classified under 2918.15.90.19 as a salt of citric acid.

Rule 3: Classification Between Multiple Headings
Specific vs. General (3a): Prefer the heading with the most specific description if multiple headings apply.

Example: Mint tea is classified under tea as it provides a specific description, unlike mint alone.

Essential Character (3b): For mixtures or composite goods, classify based on the material or component giving the essential character.

Example: A liquor gift set is classified under the liquor heading because the liquor is the essential item.

Last Resort (3c): When essential character cannot be determined, classify under the last in numerical order among equally suitable headings.

Example: A gift set with socks and ties is classified under the tie heading as it comes last numerically.

Rule 4: Most Akin Goods
Last Resort: If goods cannot be classified using the above rules, classify them under the heading most akin to the goods.
Rule 5: Containers and Packing
Containers (5a): Containers specifically designed for an article and sold together with it are classified with the article unless the container defines the product’s essential character.

Example: Flute cases are classified with the flutes they contain.

Packing Materials (5b): Packing materials and containers are classified with the goods if they are not suitable for reuse and are of a kind normally used for packing.

Example: Styrofoam used for padding is classified with the goods it protects.

Rule 6: Subheadings
Classification: Classification at the subheading level follows the same rules as for headings, considering any related subheading notes and only comparing subheadings at the same level.'''
45
# Initialize the models
bm25 = None  # BM25 will be initialized within the function
# NOTE(review): bm25_bert_search() takes `bm25` as a parameter and rebinds the
# *local* name, so this module global stays None forever and the BM25 index is
# rebuilt on every query — confirm whether caching here was intended.
# Sentence encoder used for the semantic (dense) half of the hybrid search.
bert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
48
+
49
def get_bert_embeddings(corpus, model):
    """Pre-compute embeddings for every document in *corpus*.

    Thin convenience wrapper around the encoder: returns whatever
    ``model.encode`` produces for the document list (one embedding row
    per document).
    """
    encoded = model.encode(corpus)
    return encoded
52
+
53
def load_embeddings(filename="/content/gdrive/MyDrive/hybrid search hs code/corpus_embeddings.npy"):
    """Load pre-computed corpus embeddings from an ``.npy`` file.

    The default path points at the Google-Drive mount used when the app
    runs on Colab.  Returns the array exactly as ``numpy.load`` yields it.
    """
    return np.load(filename)
56
+
57
# Perform BM25 (keyword) and BERT (semantic) hybrid search over the corpus.
def bm25_bert_search(query, df, corpus, bm25, bert_model, corpus_embeddings, top_n=10):
    """Rank *corpus* with BM25 and BERT, then merge the two top-N lists.

    Parameters
    ----------
    query : str
        Free-text product description.
    df : pandas.DataFrame
        Must contain 'Concat Description' and 'CTH Code' columns,
        row-aligned with *corpus* and *corpus_embeddings*.
    corpus : list[str]
        The searchable descriptions (``df['Concat Description']`` as a list).
    bm25 : BM25Okapi or None
        Pre-built BM25 index; built on the fly when None.
    bert_model : SentenceTransformer
        Encoder used to embed the query.
    corpus_embeddings : numpy.ndarray
        Pre-computed document embeddings, one row per corpus entry.
    top_n : int
        Hits kept from each ranker before merging (default 10).

    Returns
    -------
    list[str]
        "Description: ..., CTH Code: ..." strings for the merged,
        CTH-deduplicated hits of both rankers.
    """
    tokenized_query = query.split(" ")

    # BM25 search.  FIX: only tokenize the whole corpus when we actually have
    # to build the index — the original paid that cost on every call even
    # when a prebuilt index was passed in.
    if bm25 is None:
        tokenized_corpus = [doc.split(" ") for doc in corpus]
        bm25 = BM25Okapi(tokenized_corpus)
    bm25_scores = bm25.get_scores(tokenized_query)

    # FIX: materialize the columns as plain lists so the frames below are
    # position-aligned with the score arrays.  The original mixed
    # range(len(scores)) with index-aligned Series, which silently misaligns
    # rows whenever df does not carry a default RangeIndex.
    descriptions = df['Concat Description'].tolist()
    cth_codes = df['CTH Code'].tolist()

    bm25_results = pd.DataFrame({
        'Index': range(len(bm25_scores)),
        'BM25_Score': bm25_scores,
        'Concat Description': descriptions,
        'CTH Code': cth_codes
    }).sort_values(by='BM25_Score', ascending=False).head(top_n)

    # BERT search: cosine similarity of the query embedding vs. all documents.
    query_embedding = bert_model.encode(query)
    bert_scores = cosine_similarity([query_embedding], corpus_embeddings)[0]

    bert_results = pd.DataFrame({
        'Index': range(len(bert_scores)),
        'BERT_Score': bert_scores,
        'Concat Description': descriptions,
        'CTH Code': cth_codes
    }).sort_values(by='BERT_Score', ascending=False).head(top_n)

    # Combine BM25 and BERT results; drop_duplicates keeps the first
    # occurrence, i.e. the BM25 hit when both rankers surface the same code.
    combined_results = pd.concat([bm25_results[['Index', 'Concat Description', 'CTH Code']],
                                  bert_results[['Index', 'Concat Description', 'CTH Code']]])
    combined_results = combined_results.drop_duplicates(subset=['CTH Code'])

    # One display string per surviving row (comprehension replaces the
    # original iterrows() loop — same output, far cheaper).
    return [
        f"Description: {desc}, CTH Code: {code}"
        for desc, code in zip(combined_results['Concat Description'],
                              combined_results['CTH Code'])
    ]
98
+
99
# Load precomputed embeddings
# NOTE(review): executed at import time against a hard-coded Google-Drive
# path — the app fails to start if that mount is absent; confirm the
# deployment environment provides it.
corpus_embeddings = load_embeddings()
101
+
102
def search_and_explain(query):
    """Run the hybrid search for *query* and ask a remote LLM to pick a CTH code.

    Returns the LLM's explanation text, or an error message string when the
    remote chat API returns nothing usable.
    """
    global df

    # FIX: cache the tariff table in the module global — the original
    # re-read the whole CSV from Drive on every single query.
    if 'df' not in globals() or df is None:
        df = pd.read_csv('/content/gdrive/MyDrive/hybrid search hs code/CTH_Description (2).csv',
                         on_bad_lines='skip')  # Load your data

    corpus = df['Concat Description'].tolist()

    # Perform the search and get results as strings
    result_strings = bm25_bert_search(query, df, corpus, bm25, bert_model, corpus_embeddings, top_n=10)

    # Prepare the prompt for the API (runtime string kept byte-identical).
    prompt = f"Based on the descriptions:\n" + "\n".join(result_strings) + f"\nPlease choose the most suitable CTH code for the given product: '{query}'.Keep the GIR in mind while choosing'{gir}'.Explain the possibility of related CTH codes on certain conditions "

    # Lazy import: gradio_client is only needed when a query is actually made.
    from gradio_client import Client
    client = Client("Qwen/Qwen1.5-110B-Chat-demo")
    response = client.predict(
        query=prompt,
        history=[],
        system="You are a helpful assistant.",
        api_name="/model_chat"
    )

    # The original indexed response[1][0][1] (first chat turn's assistant
    # reply — presumably (value, chat_history, system); confirm against the
    # /model_chat endpoint).  BUG FIX: on an empty/short response the old code
    # set api_output to the fallback *string* and then indexed it as
    # api_output[0][1], raising IndexError instead of returning the message.
    if not response or len(response) < 2:
        return "No output received from the API."
    try:
        return response[1][0][1]
    except (IndexError, KeyError, TypeError):
        return "No output received from the API."
133
+
134
# Create the Gradio interface: a header, a query box, an answer box, and a
# submit handler wiring the query through the full search + LLM pipeline.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Static header/description shown at the top of the app.
    gr.Markdown(
        """
        # AI-powered Indian Customs Tariff Search with Explainability
        This app uses a combination of semantic, keyword, and ColBERT models to search the Indian customs tariff and choose the best matching CTH code with the power of interpreting with General Rules of Interpretation (GIR).
        """
    )

    # Free-text product description entered by the user.
    query_input = gr.Textbox(label="Enter Product Description", placeholder="e.g., fuel pump for elevator")
    # LLM answer: predicted CTH/HSN code plus GIR-based explanation.
    result_output = gr.Textbox(label="HSN Prediction with Explanation", lines=10)

    # Pressing Enter in the query box triggers search_and_explain.
    query_input.submit(search_and_explain, inputs=query_input, outputs=result_output)

# debug=True surfaces server-side tracebacks in the console/UI.
demo.launch(debug=True)