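"""Streamlit app: Interactive Log Analytics with N-gram Keyword Extraction.

Upload a log file, extract n-grams from each line, and explore their frequencies
through a table, word cloud, per-line heatmap, and n-gram-based log filtering.
"""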
import streamlit as st
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import plotly.express as px
import re
from collections import Counter
from itertools import chain
# Set page configuration
st.set_page_config(page_title="Advanced Log Analytics", page_icon="🔍")
# Title of the app
st.title("Interactive Log Analytics with N-gram Keyword Extraction")
# Function to read log files with error handling for invalid UTF-8 sequences
def read_log_file(file):
    try:
        # Attempt to read the file; errors='replace' substitutes any invalid
        # UTF-8 byte sequences with the Unicode replacement character
        return file.read().decode('utf-8', errors='replace')
    except Exception as e:
        st.error(f'Error reading file: {str(e)}')
        st.stop()
# Function to manually extract n-grams from log lines
STOPWORDS = set(['the', 'is', 'in', 'and', 'to', 'a', 'of', 'for', 'with', 'on', 'this', 'as', 'that', 'by', 'from', 'at', 'are', 'it', 'was', 'an', 'be', 'will', 'or', 'but', 'not'])
def extract_ngrams(text, n=2):
    # Tokenize, lowercase, and drop stopwords
    tokens = re.findall(r'\b\w+\b', text.lower())
    tokens_filtered = [token for token in tokens if token not in STOPWORDS]
    # Generate n-grams by zipping n successive offsets of the filtered token list
    ngrams = zip(*[tokens_filtered[i:] for i in range(n)])
    return [' '.join(ngram) for ngram in ngrams]
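# Example (stopwords 'to' and 'the' are removed before n-grams are formed):
#   extract_ngrams("Failed to connect to the database", 2)
#   -> ['failed connect', 'connect database']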
# File uploader for log files
uploaded_file = st.file_uploader("Upload your log file", type=["txt", "log"])
if uploaded_file is not None:
    # Read the log contents
    log_data = read_log_file(uploaded_file)
    st.write("### Log File Content Preview")
    st.text(log_data[:1000])  # Display the first 1000 characters
    # Preprocess log data (simple split by lines)
    log_lines = log_data.splitlines()
    # Slider to select number of words per n-gram (1 for unigrams, 2 for bigrams, etc.)
    n_value = st.slider("Select number of words in N-grams", min_value=1, max_value=5, value=2)
    # Extract n-grams from each log line based on the slider value
    ngrams_from_logs = [extract_ngrams(line, n_value) for line in log_lines]
    # Flatten the list of n-grams and count their frequencies
    all_ngrams = list(chain.from_iterable(ngrams_from_logs))
    ngram_frequencies = Counter(all_ngrams)
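    # ngram_frequencies maps each n-gram to its total count across all lines;
    # an illustrative result: Counter({'connection refused': 12, 'timeout error': 7, ...})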
    # Convert to a DataFrame for display; sorting by frequency is applied below
    ngram_df = pd.DataFrame(ngram_frequencies.items(), columns=['N-gram', 'Frequency'])
    # Dropdown to select top or bottom N n-grams for visualization
    top_bottom_choice = st.selectbox("Select visualization type", ["Top N", "Bottom N"])
    # Slider to select how many N-grams to display
    num_ngrams_to_display = st.slider("Select number of N-grams to display", min_value=1, max_value=50, value=20)
    if top_bottom_choice == "Top N":
        selected_ngrams_df = ngram_df.sort_values(by='Frequency', ascending=False).head(num_ngrams_to_display)
    else:
        selected_ngrams_df = ngram_df.sort_values(by='Frequency').head(num_ngrams_to_display)
    # Display selected N-grams with frequencies
    st.write(f"### {top_bottom_choice} {num_ngrams_to_display} N-grams")
    st.dataframe(selected_ngrams_df)
    # Generate and display a word cloud of selected N-grams
    st.write("### N-gram Frequency Word Cloud")
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(
        {row['N-gram']: row['Frequency'] for _, row in selected_ngrams_df.iterrows()}
    )
    fig_wc = plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    st.pyplot(fig_wc)  # pass the figure explicitly rather than the global pyplot state
    # Prepare data for heatmap: count occurrences of each selected n-gram per line number
    heatmap_data = pd.DataFrame(
        [(ng, i) for i, ngs in enumerate(ngrams_from_logs) for ng in ngs],
        columns=['N-gram', 'LineNumber']
    )
    heatmap_data = heatmap_data[heatmap_data['N-gram'].isin(selected_ngrams_df['N-gram'])]
    heatmap_matrix = pd.crosstab(index=heatmap_data['LineNumber'], columns=heatmap_data['N-gram'])
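    # heatmap_matrix: rows are line numbers (only lines containing a selected n-gram),
    # columns are the selected n-grams, and each cell counts occurrences in that line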
    # Create a heatmap using Plotly
    st.write("### Heatmap of Line Number and Selected N-grams")
    fig = px.imshow(heatmap_matrix.T,
                    labels=dict(x="Line Number", y="N-grams", color="Frequency"),
                    x=heatmap_matrix.index,
                    y=heatmap_matrix.columns,
                    aspect="auto",
                    color_continuous_scale='YlGnBu')
    fig.update_layout(title='Heatmap of Line Number vs Selected N-grams',
                      xaxis_title='Line Number',
                      yaxis_title='N-grams')
    st.plotly_chart(fig)
    # Dropdown to filter logs based on selected N-grams
    filter_ngram_choice = st.selectbox("Filter logs by N-gram", options=['None'] + list(selected_ngrams_df['N-gram']))
    if filter_ngram_choice != 'None':
        # log_lines and ngrams_from_logs are parallel lists (one entry per log line),
        # so zipping them pairs each line with the n-grams extracted from it
        filtered_logs = [line for line, ngrams in zip(log_lines, ngrams_from_logs) if filter_ngram_choice in ngrams]
        st.write(f"### Logs containing '{filter_ngram_choice}'")
        st.dataframe(pd.DataFrame(filtered_logs, columns=["Log Line"]))
# Sidebar information
st.sidebar.title("About")
st.sidebar.info("""
This tool manually extracts n-grams from uploaded log files to help identify trends,
anomalies, and potential issues through interactive visualization and analysis.
""")