import streamlit as st
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import plotly.express as px
import re
from collections import Counter
from itertools import chain

# Page setup
st.set_page_config(page_title="Advanced Log Analytics", page_icon="🔍")
st.title("Interactive Log Analytics with N-gram Keyword Extraction")

# Common English function words excluded before n-gram extraction, so no
# generated n-gram ever contains one of them.
STOPWORDS = set(['the', 'is', 'in', 'and', 'to', 'a', 'of', 'for', 'with', 'on',
                 'this', 'as', 'that', 'by', 'from', 'at', 'are', 'it', 'was',
                 'an', 'be', 'will', 'or', 'but', 'not'])


def read_log_file(file):
    """Decode an uploaded log file as UTF-8 text.

    Invalid byte sequences are replaced (errors='replace') rather than
    raising; any other failure reports the error in the UI and halts the
    Streamlit script via st.stop().

    Args:
        file: A file-like object (Streamlit UploadedFile) yielding bytes.

    Returns:
        The decoded file contents as a str.
    """
    try:
        return file.read().decode('utf-8', errors='replace')
    except Exception as e:
        st.error(f'Error reading file: {str(e)}')
        st.stop()


def extract_ngrams(text, n=2):
    """Return space-joined n-grams extracted from *text*.

    Tokenizes on word boundaries, lowercases, and drops STOPWORDS *before*
    building n-grams, so grams may span a removed stop word.

    Args:
        text: The input string (typically one log line).
        n: Number of words per gram (1 = unigrams, 2 = bigrams, ...).

    Returns:
        A list of 'word1 word2 ...' strings; empty when fewer than *n*
        non-stopword tokens remain.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    tokens_filtered = [token for token in tokens if token not in STOPWORDS]
    # zip over n staggered views of the token list yields sliding windows.
    ngrams = zip(*[tokens_filtered[i:] for i in range(n)])
    return [' '.join(ngram) for ngram in ngrams]


# --- Main interactive flow -------------------------------------------------
uploaded_file = st.file_uploader("Upload your log file", type=["txt", "log"])

if uploaded_file is not None:
    # Read and preview the raw log content.
    log_data = read_log_file(uploaded_file)
    st.write("### Log File Content Preview")
    st.text(log_data[:1000])  # Display the first 1000 characters

    log_lines = log_data.splitlines()

    # N-gram size selector (1 for unigrams, 2 for bigrams, ...).
    n_value = st.slider("Select number of words in N-grams",
                        min_value=1, max_value=5, value=2)

    # Per-line n-gram lists (kept per-line for the heatmap/filter below),
    # then flattened and counted for the global frequency table.
    ngrams_from_logs = [extract_ngrams(line, n_value) for line in log_lines]
    all_ngrams = list(chain.from_iterable(ngrams_from_logs))
    ngram_frequencies = Counter(all_ngrams)

    # Guard: short lines or a large n can yield zero n-grams, and
    # WordCloud.generate_from_frequencies raises on an empty mapping.
    if not ngram_frequencies:
        st.warning("No n-grams could be extracted from this file with the "
                   "selected n-gram size. Try a smaller value.")
        st.stop()

    ngram_df = pd.DataFrame(ngram_frequencies.items(),
                            columns=['N-gram', 'Frequency'])

    # Visualization controls: top/bottom and how many to show.
    top_bottom_choice = st.selectbox("Select visualization type",
                                     ["Top N", "Bottom N"])
    num_ngrams_to_display = st.slider("Select number of N-grams to display",
                                      min_value=1, max_value=50, value=20)

    if top_bottom_choice == "Top N":
        selected_ngrams_df = ngram_df.sort_values(
            by='Frequency', ascending=False).head(num_ngrams_to_display)
    else:
        selected_ngrams_df = ngram_df.sort_values(
            by='Frequency').head(num_ngrams_to_display)

    st.write(f"### {top_bottom_choice} {num_ngrams_to_display} N-grams")
    st.dataframe(selected_ngrams_df)

    # Word cloud of the selected n-grams, sized by frequency.
    st.write("### N-gram Frequency Word Cloud")
    wordcloud = WordCloud(width=800, height=400,
                          background_color='white').generate_from_frequencies(
        dict(zip(selected_ngrams_df['N-gram'], selected_ngrams_df['Frequency']))
    )
    # Use an explicit Figure: passing the pyplot module to st.pyplot is
    # deprecated, and closing the figure avoids leaking memory across reruns.
    wc_fig, wc_ax = plt.subplots(figsize=(10, 5))
    wc_ax.imshow(wordcloud, interpolation='bilinear')
    wc_ax.axis('off')
    st.pyplot(wc_fig)
    plt.close(wc_fig)

    # Heatmap data: one row per (n-gram, line-number) occurrence, restricted
    # to the currently selected n-grams, then cross-tabulated into a matrix.
    heatmap_data = pd.DataFrame(
        [(ng, i) for i, ngs in enumerate(ngrams_from_logs) for ng in ngs],
        columns=['N-gram', 'LineNumber'])
    heatmap_data = heatmap_data[
        heatmap_data['N-gram'].isin(selected_ngrams_df['N-gram'])]
    heatmap_matrix = pd.crosstab(index=heatmap_data['LineNumber'],
                                 columns=heatmap_data['N-gram'])

    # Heatmap of line number vs. selected n-grams using Plotly.
    st.write("### Heatmap of Line Number and Selected N-grams")
    fig = px.imshow(heatmap_matrix.T,
                    labels=dict(x="Line Number", y="N-grams", color="Frequency"),
                    x=heatmap_matrix.index,
                    y=heatmap_matrix.columns,
                    aspect="auto",
                    color_continuous_scale='YlGnBu')
    fig.update_layout(title='Heatmap of Line Number vs Selected N-grams',
                      xaxis_title='Line Number',
                      yaxis_title='N-grams')
    st.plotly_chart(fig)

    # Optional drill-down: show only the log lines containing a chosen n-gram.
    filter_ngram_choice = st.selectbox(
        "Filter logs by N-gram",
        options=['None'] + list(selected_ngrams_df['N-gram']))
    if filter_ngram_choice != 'None':
        filtered_logs = [line for line, ngrams in zip(log_lines, ngrams_from_logs)
                         if filter_ngram_choice in ngrams]
        st.write(f"### Logs containing '{filter_ngram_choice}'")
        st.dataframe(pd.DataFrame(filtered_logs, columns=["Log Line"]))

# Sidebar information
st.sidebar.title("About")
st.sidebar.info("""
This tool uses manual extraction of n-grams from log files to identify trends, anomalies, and potential issues through interactive visualization and analysis.
""")