import streamlit as st
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import plotly.express as px
import re
from collections import Counter
from itertools import chain

# Set page configuration
st.set_page_config(page_title="Advanced Log Analytics", page_icon="π")

# Title of the app
st.title("Interactive Log Analytics with N-gram Keyword Extraction")

# Function to read log files with error handling for invalid UTF-8 sequences
def read_log_file(file):
    try:
        # Attempt to read the file with error handling
        return file.read().decode('utf-8', errors='replace')
    except Exception as e:
        st.error(f'Error reading file: {str(e)}')
        st.stop()
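# Note: errors='replace' substitutes any byte sequence that is not valid UTF-8
# with the Unicode replacement character U+FFFD, so a partially binary log file
# still loads instead of aborting, e.g.:
#   b'\xffERROR'.decode('utf-8', errors='replace')  ->  '\ufffdERROR'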
# Stopwords to drop before building n-grams
STOPWORDS = set(['the', 'is', 'in', 'and', 'to', 'a', 'of', 'for', 'with', 'on', 'this', 'as', 'that', 'by', 'from', 'at', 'are', 'it', 'was', 'an', 'be', 'will', 'or', 'but', 'not'])

# Function to manually extract n-grams from log lines
def extract_ngrams(text, n=2):
    # Tokenize and filter tokens
    tokens = re.findall(r'\b\w+\b', text.lower())
    tokens_filtered = [token for token in tokens if token not in STOPWORDS]
    # Generate n-grams by zipping n shifted copies of the filtered token list
    ngrams = zip(*[tokens_filtered[i:] for i in range(n)])
    return [' '.join(ngram) for ngram in ngrams]
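# Illustrative example (not part of the app flow): for the line
# "Error connecting to the database", extract_ngrams(line, 2) tokenizes to
# ['error', 'connecting', 'to', 'the', 'database'], drops the stopwords
# 'to' and 'the', and returns ['error connecting', 'connecting database'].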
# File uploader for log files
uploaded_file = st.file_uploader("Upload your log file", type=["txt", "log"])

if uploaded_file is not None:
    # Read the log contents
    log_data = read_log_file(uploaded_file)
    st.write("### Log File Content Preview")
    st.text(log_data[:1000])  # Display the first 1000 characters

    # Preprocess log data (simple split by lines)
    log_lines = log_data.splitlines()

    # Slider to select the n-gram size (1 for unigrams, 2 for bigrams, etc.)
    n_value = st.slider("Select number of words in N-grams", min_value=1, max_value=5, value=2)

    # Extract n-grams from each log line based on the slider value
    ngrams_from_logs = [extract_ngrams(line, n_value) for line in log_lines]

    # Flatten the list of n-grams and count their frequencies
    all_ngrams = list(chain.from_iterable(ngrams_from_logs))
    ngram_frequencies = Counter(all_ngrams)
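    # For example, Counter(['error timeout', 'error timeout', 'db retry'])
    # yields Counter({'error timeout': 2, 'db retry': 1}), i.e. each n-gram
    # mapped to its total count across all lines.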
    # Convert to a DataFrame for display; sorting by frequency happens below
    ngram_df = pd.DataFrame(ngram_frequencies.items(), columns=['N-gram', 'Frequency'])

    # Dropdown to select top or bottom N n-grams for visualization
    top_bottom_choice = st.selectbox("Select visualization type", ["Top N", "Bottom N"])

    # Slider to select how many N-grams to display
    num_ngrams_to_display = st.slider("Select number of N-grams to display", min_value=1, max_value=50, value=20)

    if top_bottom_choice == "Top N":
        selected_ngrams_df = ngram_df.sort_values(by='Frequency', ascending=False).head(num_ngrams_to_display)
    else:
        selected_ngrams_df = ngram_df.sort_values(by='Frequency').head(num_ngrams_to_display)

    # Display selected N-grams with frequencies
    st.write(f"### {top_bottom_choice} {num_ngrams_to_display} N-grams")
    st.dataframe(selected_ngrams_df)

    # Generate and display a word cloud of selected N-grams
    st.write("### N-gram Frequency Word Cloud")
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(
        {row['N-gram']: row['Frequency'] for _, row in selected_ngrams_df.iterrows()}
    )
    fig_wc = plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    st.pyplot(fig_wc)
    # Prepare data for heatmap: count occurrences of each selected n-gram per line number
    heatmap_data = pd.DataFrame([(ng, i) for i, ngs in enumerate(ngrams_from_logs) for ng in ngs],
                                columns=['N-gram', 'LineNumber'])
    heatmap_data = heatmap_data[heatmap_data['N-gram'].isin(selected_ngrams_df['N-gram'])]
    heatmap_matrix = pd.crosstab(index=heatmap_data['LineNumber'], columns=heatmap_data['N-gram'])
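    # The resulting matrix has one row per log line that contains at least one
    # selected n-gram and one column per selected n-gram; each cell holds how
    # many times that n-gram occurs in that line.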
    # Create a heatmap using Plotly
    st.write("### Heatmap of Line Number and Selected N-grams")
    fig = px.imshow(heatmap_matrix.T,
                    labels=dict(x="Line Number", y="N-grams", color="Frequency"),
                    x=heatmap_matrix.index,
                    y=heatmap_matrix.columns,
                    aspect="auto",
                    color_continuous_scale='YlGnBu')
    fig.update_layout(title='Heatmap of Line Number vs Selected N-grams',
                      xaxis_title='Line Number',
                      yaxis_title='N-grams')
    st.plotly_chart(fig)

    # Dropdown to filter logs based on selected N-grams
    filter_ngram_choice = st.selectbox("Filter logs by N-gram", options=['None'] + list(selected_ngrams_df['N-gram']))

    if filter_ngram_choice != 'None':
        filtered_logs = [line for line, ngrams in zip(log_lines, ngrams_from_logs) if filter_ngram_choice in ngrams]
        st.write(f"### Logs containing '{filter_ngram_choice}'")
        st.dataframe(pd.DataFrame(filtered_logs, columns=["Log Line"]))
# Sidebar information
st.sidebar.title("About")
st.sidebar.info("""
This tool uses manual extraction of n-grams from log files to identify trends, anomalies,
and potential issues through interactive visualization and analysis.
""")