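"""Streamlit app: Interactive Log Analytics with N-gram Keyword Extraction.

Upload a log file, extract n-grams from each line, and explore their frequencies
through a table, word cloud, per-line heatmap, and n-gram-based log filtering.
"""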
import streamlit as st
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import plotly.express as px
import re
from collections import Counter
from itertools import chain
# Set page configuration
st.set_page_config(page_title="Advanced Log Analytics", page_icon="🔍")
# Title of the app
st.title("Interactive Log Analytics with N-gram Keyword Extraction")
# Function to read log files with error handling for invalid UTF-8 sequences
def read_log_file(file):
    try:
        # Attempt to read the file; errors='replace' substitutes any invalid
        # UTF-8 byte sequences with the Unicode replacement character
        return file.read().decode('utf-8', errors='replace')
    except Exception as e:
        st.error(f'Error reading file: {str(e)}')
        st.stop()
# Function to manually extract n-grams from log lines
STOPWORDS = set(['the', 'is', 'in', 'and', 'to', 'a', 'of', 'for', 'with', 'on', 'this', 'as', 'that', 'by', 'from', 'at', 'are', 'it', 'was', 'an', 'be', 'will', 'or', 'but', 'not'])
def extract_ngrams(text, n=2):
    # Tokenize, lowercase, and drop stopwords
    tokens = re.findall(r'\b\w+\b', text.lower())
    tokens_filtered = [token for token in tokens if token not in STOPWORDS]
    # Generate n-grams by zipping n successive offsets of the filtered token list
    ngrams = zip(*[tokens_filtered[i:] for i in range(n)])
    return [' '.join(ngram) for ngram in ngrams]
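# Example (stopwords 'to' and 'the' are removed before n-grams are formed):
#   extract_ngrams("Failed to connect to the database", 2)
#   -> ['failed connect', 'connect database']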
# File uploader for log files
uploaded_file = st.file_uploader("Upload your log file", type=["txt", "log"])
if uploaded_file is not None:
    # Read the log contents
    log_data = read_log_file(uploaded_file)
    st.write("### Log File Content Preview")
    st.text(log_data[:1000])  # Display the first 1000 characters
    # Preprocess log data (simple split by lines)
    log_lines = log_data.splitlines()
    # Slider to select number of words per n-gram (1 for unigrams, 2 for bigrams, etc.)
    n_value = st.slider("Select number of words in N-grams", min_value=1, max_value=5, value=2)
    # Extract n-grams from each log line based on the slider value
    ngrams_from_logs = [extract_ngrams(line, n_value) for line in log_lines]
    # Flatten the list of n-grams and count their frequencies
    all_ngrams = list(chain.from_iterable(ngrams_from_logs))
    ngram_frequencies = Counter(all_ngrams)
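    # ngram_frequencies maps each n-gram to its total count across all lines;
    # an illustrative result: Counter({'connection refused': 12, 'timeout error': 7, ...})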
    # Convert to a DataFrame for display; sorting by frequency is applied below
    ngram_df = pd.DataFrame(ngram_frequencies.items(), columns=['N-gram', 'Frequency'])
    # Dropdown to select top or bottom N n-grams for visualization
    top_bottom_choice = st.selectbox("Select visualization type", ["Top N", "Bottom N"])
    # Slider to select how many N-grams to display
    num_ngrams_to_display = st.slider("Select number of N-grams to display", min_value=1, max_value=50, value=20)
    if top_bottom_choice == "Top N":
        selected_ngrams_df = ngram_df.sort_values(by='Frequency', ascending=False).head(num_ngrams_to_display)
    else:
        selected_ngrams_df = ngram_df.sort_values(by='Frequency').head(num_ngrams_to_display)
    # Display selected N-grams with frequencies
    st.write(f"### {top_bottom_choice} {num_ngrams_to_display} N-grams")
    st.dataframe(selected_ngrams_df)
    # Generate and display a word cloud of selected N-grams
    st.write("### N-gram Frequency Word Cloud")
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(
        {row['N-gram']: row['Frequency'] for _, row in selected_ngrams_df.iterrows()}
    )
    fig_wc = plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    st.pyplot(fig_wc)  # pass the figure explicitly rather than the global pyplot state
    # Prepare data for heatmap: count occurrences of each selected n-gram per line number
    heatmap_data = pd.DataFrame(
        [(ng, i) for i, ngs in enumerate(ngrams_from_logs) for ng in ngs],
        columns=['N-gram', 'LineNumber']
    )
    heatmap_data = heatmap_data[heatmap_data['N-gram'].isin(selected_ngrams_df['N-gram'])]
    heatmap_matrix = pd.crosstab(index=heatmap_data['LineNumber'], columns=heatmap_data['N-gram'])
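    # heatmap_matrix: rows are line numbers (only lines containing a selected n-gram),
    # columns are the selected n-grams, and each cell counts occurrences in that line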
    # Create a heatmap using Plotly
    st.write("### Heatmap of Line Number and Selected N-grams")
    fig = px.imshow(heatmap_matrix.T,
                    labels=dict(x="Line Number", y="N-grams", color="Frequency"),
                    x=heatmap_matrix.index,
                    y=heatmap_matrix.columns,
                    aspect="auto",
                    color_continuous_scale='YlGnBu')
    fig.update_layout(title='Heatmap of Line Number vs Selected N-grams',
                      xaxis_title='Line Number',
                      yaxis_title='N-grams')
    st.plotly_chart(fig)
    # Dropdown to filter logs based on selected N-grams
    filter_ngram_choice = st.selectbox("Filter logs by N-gram", options=['None'] + list(selected_ngrams_df['N-gram']))
    if filter_ngram_choice != 'None':
        # log_lines and ngrams_from_logs are parallel lists (one entry per log line),
        # so zipping them pairs each line with the n-grams extracted from it
        filtered_logs = [line for line, ngrams in zip(log_lines, ngrams_from_logs) if filter_ngram_choice in ngrams]
        st.write(f"### Logs containing '{filter_ngram_choice}'")
        st.dataframe(pd.DataFrame(filtered_logs, columns=["Log Line"]))
# Sidebar information
st.sidebar.title("About")
st.sidebar.info("""
This tool manually extracts n-grams from uploaded log files to help identify trends,
anomalies, and potential issues through interactive visualization and analysis.
""")