import gradio as gr
import pandas as pd
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from gradio_client import Client

gir = '''The six General Rules of Interpretation (GIR) guide the classification of goods under the tariff. Rules 1 to 4 must be applied sequentially, while Rules 5 and 6 are standalone.

Rule 1: Titles and Terms
Titles: Section, Chapter, and sub-Chapter titles are for reference only.
Classification: Determine classification according to the terms of the headings and any related Section or Chapter Notes. Refer to GIRs 2 to 6 for further details.
Rule 2: Incomplete and Mixed Goods
Incomplete Goods (2a): Classification includes unfinished, incomplete, or disassembled items if they have the essential character of the finished product.

Mixed Goods (2b): A reference to a material includes mixtures or combinations with other materials. Goods with multiple materials are classified based on the principles of Rule 3.

Example: Dicalcium citrate, a compound with citric acid characteristics, is classified under 2918.15.90.19 as a salt of citric acid.

Rule 3: Classification Between Multiple Headings
Specific vs. General (3a): Prefer the heading with the most specific description if multiple headings apply.

Example: Mint tea is classified under tea as it provides a specific description, unlike mint alone.

Essential Character (3b): For mixtures or composite goods, classify based on the material or component giving the essential character.

Example: A liquor gift set is classified under the liquor heading because the liquor is the essential item.

Last Resort (3c): When essential character cannot be determined, classify under the last in numerical order among equally suitable headings.

Example: A gift set with socks and ties is classified under the tie heading as it comes last numerically.

Rule 4: Most Akin Goods
Last Resort: If goods cannot be classified using the above rules, classify them under the heading most akin to the goods.
Rule 5: Containers and Packing
Containers (5a): Containers specifically designed for an article and sold together with it are classified with the article unless the container defines the product’s essential character.

Example: Flute cases are classified with the flutes they contain.

Packing Materials (5b): Packing materials and containers are classified with the goods if they are not suitable for reuse and are of a kind normally used for packing.

Example: Styrofoam used for padding is classified with the goods it protects.

Rule 6: Subheadings
Classification: Classification at the subheading level follows the same rules as for headings, considering any related subheading notes and only comparing subheadings at the same level.'''

# Initialize the models
bm25 = None  # BM25 index, built lazily on the first query and then cached
bert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Pre-compute BERT embeddings
def get_bert_embeddings(corpus, model):
    return model.encode(corpus)

def load_embeddings(filename="corpus_embeddings.npy"):
    embeddings = np.load(filename)
    return embeddings
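
# get_bert_embeddings is not called at runtime; corpus_embeddings.npy is
# assumed to have been produced offline. A minimal sketch of that one-time
# step, assuming the same CSV file and column used in search_and_explain below:
#
#     df = pd.read_csv('CTH_Description (2).csv', on_bad_lines='skip')
#     corpus = df['Concat Description'].tolist()
#     np.save("corpus_embeddings.npy", get_bert_embeddings(corpus, bert_model))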

# Perform BM25 and BERT search
def bm25_bert_search(query, df, corpus, bert_model, corpus_embeddings, top_n=10):
    global bm25
    tokenized_query = query.split(" ")

    # BM25 search: build the index once on the first call, then reuse it
    if bm25 is None:
        tokenized_corpus = [doc.split(" ") for doc in corpus]
        bm25 = BM25Okapi(tokenized_corpus)
    bm25_scores = bm25.get_scores(tokenized_query)
    
    bm25_results = pd.DataFrame({
        'Index': range(len(bm25_scores)),
        'BM25_Score': bm25_scores,
        'Concat Description': df['Concat Description'],
        'CTH Code': df['CTH Code']
    }).sort_values(by='BM25_Score', ascending=False).head(top_n)

    # BERT search
    query_embedding = bert_model.encode(query)
    bert_scores = cosine_similarity([query_embedding], corpus_embeddings)[0]

    bert_results = pd.DataFrame({
        'Index': range(len(bert_scores)),
        'BERT_Score': bert_scores,
        'Concat Description': df['Concat Description'],
        'CTH Code': df['CTH Code']
    }).sort_values(by='BERT_Score', ascending=False).head(top_n)

    # Combine BM25 and BERT results
    combined_results = pd.concat([bm25_results[['Index', 'Concat Description', 'CTH Code']],
                                  bert_results[['Index', 'Concat Description', 'CTH Code']]])

    # Drop duplicates based on 'CTH Code'
    combined_results = combined_results.drop_duplicates(subset=['CTH Code'])

    # Create a string for each row and store them in a list
    result_strings = []
    for index, row in combined_results.iterrows():
        result_strings.append(f"Description: {row['Concat Description']}, CTH Code: {row['CTH Code']}")
    
    return result_strings
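
# Example usage, assuming df, corpus, and corpus_embeddings are already loaded
# (the query is the placeholder suggested in the UI below):
#
#     hits = bm25_bert_search("fuel pump for elevator", df, corpus,
#                             bert_model, corpus_embeddings, top_n=5)
#     for hit in hits:
#         print(hit)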

# Load precomputed embeddings
corpus_embeddings = load_embeddings()
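# corpus_embeddings.npy must exist next to this script at startup; see the
# offline sketch above for how it can be generated.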

df = None  # tariff DataFrame, loaded lazily on the first query

def search_and_explain(query):
    # Load the tariff data once and cache it for subsequent queries
    global df
    if df is None:
        df = pd.read_csv('CTH_Description (2).csv', on_bad_lines='skip')

    corpus = df['Concat Description'].tolist()
    
    # Perform the search and get results as strings
    result_strings = bm25_bert_search(query, df, corpus, bert_model, corpus_embeddings, top_n=10)
    
    # Prepare the prompt for the LLM
    prompt = (
        "Based on the descriptions:\n" + "\n".join(result_strings) +
        f"\nPlease choose the most suitable CTH code for the given product: '{query}'. "
        f"Keep the General Rules of Interpretation (GIR) in mind while choosing:\n{gir}\n"
        "Explain the possibility of related CTH codes under certain conditions."
    )
    
    # Call the hosted Qwen chat model via gradio_client
    client = Client("Qwen/Qwen1.5-110B-Chat-demo")
    response = client.predict(
        query=prompt,
        history=[],
        system="You are a helpful assistant.",
        api_name="/model_chat"
    )

    # response[1] holds the chat history; the model's reply is the second
    # element of the last (query, reply) turn
    if response and len(response) > 1 and response[1]:
        return response[1][-1][1]
    return "No output received from the API."

# Create the Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # AI-powered Indian Customs Tariff Search with Explainability
        This app combines keyword (BM25) and semantic (sentence-transformer) search over the Indian customs tariff, then uses an LLM guided by the General Rules of Interpretation (GIR) to choose the best-matching CTH code and explain the choice.
        """
    )
    
    query_input = gr.Textbox(label="Enter Product Description", placeholder="e.g., fuel pump for elevator")
    result_output = gr.Textbox(label="HSN Prediction with Explanation", lines=10)
    
    query_input.submit(search_and_explain, inputs=query_input, outputs=result_output)

demo.launch(debug=True)