import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class NLQueryEngine:
    def __init__(self):
        # Fetch the NLTK resources needed for tokenization, stop-word
        # removal, and lemmatization (no-op if already downloaded).
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        nltk.download('wordnet', quiet=True)
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.vectorizer = TfidfVectorizer()

    def process_query(self, query, data):
        # Pipeline: normalize the query, classify its intent, then dispatch
        # to the handler that runs against the pandas DataFrame.
        tokens = self.preprocess_query(query)
        intent = self.identify_intent(tokens)
        result = self.execute_query(intent, tokens, data)
        return result

    def preprocess_query(self, query):
        # Tokenize, remove stop words, and lemmatize
        tokens = word_tokenize(query.lower())
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stop_words]
        return tokens

    def identify_intent(self, tokens):
        # Match single-word keywords against individual tokens and multi-word
        # phrases against the re-joined query. Note that preprocess_query
        # strips stop words such as 'between' and 'over', so those intents
        # rely on their remaining keywords.
        joined = ' '.join(tokens)
        intent_keywords = {
            'statistical': ['average', 'mean', 'median', 'mode', 'max', 'maximum', 'min', 'minimum', 'sum', 'count'],
            'comparison': ['compare', 'difference', 'versus', 'vs'],
            'trend': ['trend', 'over time', 'increase', 'decrease'],
            'distribution': ['distribution', 'spread', 'range'],
            'correlation': ['correlation', 'relationship', 'between']
        }

        for intent, keywords in intent_keywords.items():
            if any(keyword in tokens or (' ' in keyword and keyword in joined)
                   for keyword in keywords):
                return intent
        return 'general'

    def find_column(self, tokens, columns):
        # Fit TF-IDF on the column names, score the query against them with
        # cosine similarity, and require a 0.1 threshold so unrelated queries
        # fall through to the clarification prompt.
        column_vectors = self.vectorizer.fit_transform(columns)
        query_vector = self.vectorizer.transform([' '.join(tokens)])
        similarities = cosine_similarity(query_vector, column_vectors).flatten()
        best_match_index = similarities.argmax()
        return columns[best_match_index] if similarities[best_match_index] > 0.1 else None

    def execute_query(self, intent, tokens, data):
        column = self.find_column(tokens, data.columns)
        if not column:
            return "I couldn't identify a relevant column in your query. Can you please specify the column name?"

        if intent == 'statistical':
            return self.statistical_query(tokens, data, column)
        elif intent == 'comparison':
            return self.comparison_query(tokens, data, column)
        elif intent == 'trend':
            return self.trend_query(data, column)
        elif intent == 'distribution':
            return self.distribution_query(data, column)
        elif intent == 'correlation':
            return self.correlation_query(data, column, tokens)
        else:
            return f"Here's a summary of {column}:\n{data[column].describe()}"

    def statistical_query(self, tokens, data, column):
        if 'average' in tokens or 'mean' in tokens:
            return f"The average of {column} is {data[column].mean():.2f}"
        elif 'median' in tokens:
            return f"The median of {column} is {data[column].median():.2f}"
        elif 'mode' in tokens:
            return f"The mode of {column} is {data[column].mode().values[0]}"
        elif 'maximum' in tokens or 'max' in tokens:
            return f"The maximum of {column} is {data[column].max():.2f}"
        elif 'minimum' in tokens or 'min' in tokens:
            return f"The minimum of {column} is {data[column].min():.2f}"
        elif 'sum' in tokens:
            return f"The sum of {column} is {data[column].sum():.2f}"
        elif 'count' in tokens:
            return f"The count of {column} is {data[column].count()}"
        else:
            # Defensive fallback in case no statistical keyword survives preprocessing.
            return f"Here's a summary of {column}:\n{data[column].describe()}"

    def comparison_query(self, tokens, data, column):
        # Implement comparison logic here
        return f"Comparison analysis for {column} is not implemented yet."

    def trend_query(self, data, column):
        # Implement trend analysis logic here
        return f"Trend analysis for {column} is not implemented yet."

    def distribution_query(self, data, column):
        # Implement distribution analysis logic here
        return f"Distribution analysis for {column} is not implemented yet."

    def correlation_query(self, data, column1, tokens):
        # Remove tokens that name the first column (tokens are lowercased, so
        # compare against column1.lower()) and guard against the second lookup
        # returning the same column again.
        remaining_tokens = [token for token in tokens if token != column1.lower()]
        column2 = self.find_column(remaining_tokens, data.columns)
        if column2 and column2 != column1:
            correlation = data[column1].corr(data[column2])
            return f"The correlation between {column1} and {column2} is {correlation:.2f}"
        else:
            return f"I couldn't identify a second column to correlate with {column1}. Can you please specify two column names?"