import streamlit as st
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import plotly.express as px
import re
from collections import Counter
from itertools import chain

# Page setup
st.set_page_config(page_title="Advanced Log Analytics", page_icon="🔍")
st.title("Interactive Log Analytics with N-gram Keyword Extraction")

# Common English function words excluded before n-gram extraction, so no
# generated n-gram ever contains one of them.
STOPWORDS = set(['the', 'is', 'in', 'and', 'to', 'a', 'of', 'for', 'with', 'on',
                 'this', 'as', 'that', 'by', 'from', 'at', 'are', 'it', 'was',
                 'an', 'be', 'will', 'or', 'but', 'not'])


def read_log_file(file):
    """Decode an uploaded log file as UTF-8 text.

    Invalid byte sequences are replaced (errors='replace') rather than
    raising; any other failure reports the error in the UI and halts the
    Streamlit script via st.stop().

    Args:
        file: A file-like object (Streamlit UploadedFile) yielding bytes.

    Returns:
        The decoded file contents as a str.
    """
    try:
        return file.read().decode('utf-8', errors='replace')
    except Exception as e:
        st.error(f'Error reading file: {str(e)}')
        st.stop()


def extract_ngrams(text, n=2):
    """Return space-joined n-grams extracted from *text*.

    Tokenizes on word boundaries, lowercases, and drops STOPWORDS *before*
    building n-grams, so grams may span a removed stop word.

    Args:
        text: The input string (typically one log line).
        n: Number of words per gram (1 = unigrams, 2 = bigrams, ...).

    Returns:
        A list of 'word1 word2 ...' strings; empty when fewer than *n*
        non-stopword tokens remain.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    tokens_filtered = [token for token in tokens if token not in STOPWORDS]
    # zip over n staggered views of the token list yields sliding windows.
    ngrams = zip(*[tokens_filtered[i:] for i in range(n)])
    return [' '.join(ngram) for ngram in ngrams]


# --- Main interactive flow -------------------------------------------------
uploaded_file = st.file_uploader("Upload your log file", type=["txt", "log"])

if uploaded_file is not None:
    # Read and preview the raw log content.
    log_data = read_log_file(uploaded_file)
    st.write("### Log File Content Preview")
    st.text(log_data[:1000])  # Display the first 1000 characters

    log_lines = log_data.splitlines()

    # N-gram size selector (1 for unigrams, 2 for bigrams, ...).
    n_value = st.slider("Select number of words in N-grams",
                        min_value=1, max_value=5, value=2)

    # Per-line n-gram lists (kept per-line for the heatmap/filter below),
    # then flattened and counted for the global frequency table.
    ngrams_from_logs = [extract_ngrams(line, n_value) for line in log_lines]
    all_ngrams = list(chain.from_iterable(ngrams_from_logs))
    ngram_frequencies = Counter(all_ngrams)

    # Guard: short lines or a large n can yield zero n-grams, and
    # WordCloud.generate_from_frequencies raises on an empty mapping.
    if not ngram_frequencies:
        st.warning("No n-grams could be extracted from this file with the "
                   "selected n-gram size. Try a smaller value.")
        st.stop()

    ngram_df = pd.DataFrame(ngram_frequencies.items(),
                            columns=['N-gram', 'Frequency'])

    # Visualization controls: top/bottom and how many to show.
    top_bottom_choice = st.selectbox("Select visualization type",
                                     ["Top N", "Bottom N"])
    num_ngrams_to_display = st.slider("Select number of N-grams to display",
                                      min_value=1, max_value=50, value=20)

    if top_bottom_choice == "Top N":
        selected_ngrams_df = ngram_df.sort_values(
            by='Frequency', ascending=False).head(num_ngrams_to_display)
    else:
        selected_ngrams_df = ngram_df.sort_values(
            by='Frequency').head(num_ngrams_to_display)

    st.write(f"### {top_bottom_choice} {num_ngrams_to_display} N-grams")
    st.dataframe(selected_ngrams_df)

    # Word cloud of the selected n-grams, sized by frequency.
    st.write("### N-gram Frequency Word Cloud")
    wordcloud = WordCloud(width=800, height=400,
                          background_color='white').generate_from_frequencies(
        dict(zip(selected_ngrams_df['N-gram'], selected_ngrams_df['Frequency']))
    )
    # Use an explicit Figure: passing the pyplot module to st.pyplot is
    # deprecated, and closing the figure avoids leaking memory across reruns.
    wc_fig, wc_ax = plt.subplots(figsize=(10, 5))
    wc_ax.imshow(wordcloud, interpolation='bilinear')
    wc_ax.axis('off')
    st.pyplot(wc_fig)
    plt.close(wc_fig)

    # Heatmap data: one row per (n-gram, line-number) occurrence, restricted
    # to the currently selected n-grams, then cross-tabulated into a matrix.
    heatmap_data = pd.DataFrame(
        [(ng, i) for i, ngs in enumerate(ngrams_from_logs) for ng in ngs],
        columns=['N-gram', 'LineNumber'])
    heatmap_data = heatmap_data[
        heatmap_data['N-gram'].isin(selected_ngrams_df['N-gram'])]
    heatmap_matrix = pd.crosstab(index=heatmap_data['LineNumber'],
                                 columns=heatmap_data['N-gram'])

    # Heatmap of line number vs. selected n-grams using Plotly.
    st.write("### Heatmap of Line Number and Selected N-grams")
    fig = px.imshow(heatmap_matrix.T,
                    labels=dict(x="Line Number", y="N-grams", color="Frequency"),
                    x=heatmap_matrix.index,
                    y=heatmap_matrix.columns,
                    aspect="auto",
                    color_continuous_scale='YlGnBu')
    fig.update_layout(title='Heatmap of Line Number vs Selected N-grams',
                      xaxis_title='Line Number',
                      yaxis_title='N-grams')
    st.plotly_chart(fig)

    # Optional drill-down: show only the log lines containing a chosen n-gram.
    filter_ngram_choice = st.selectbox(
        "Filter logs by N-gram",
        options=['None'] + list(selected_ngrams_df['N-gram']))
    if filter_ngram_choice != 'None':
        filtered_logs = [line for line, ngrams in zip(log_lines, ngrams_from_logs)
                         if filter_ngram_choice in ngrams]
        st.write(f"### Logs containing '{filter_ngram_choice}'")
        st.dataframe(pd.DataFrame(filtered_logs, columns=["Log Line"]))

# Sidebar information
st.sidebar.title("About")
st.sidebar.info("""
This tool uses manual extraction of n-grams from log files to identify trends, anomalies, and potential issues through interactive visualization and analysis.
""")