import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import plotly.graph_objects as go
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# Download the NLTK resources used below (no-op if they are already installed)
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Set up logging
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Function to fetch HTML content from GitHub issue pages.
# Note: the CSS class names used below reflect GitHub's markup at the time of
# writing and may change without notice.
def fetch_issue_data(username, repository, start_page, end_page):
    issues_data = []
    for page in range(start_page, end_page + 1):
        url = f"https://github.com/{username}/{repository}/issues?page={page}"
        logging.info(f"Fetching issue list page {page}: {url}")
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        issue_elements = soup.find_all('div', class_='flex-shrink-0')
        for issue_element in issue_elements:
            link = issue_element.find('a', class_='Link--primary')
            if link is None:  # skip containers that are not issue rows
                continue
            issue_url = f"https://github.com{link['href']}"
            issue_data = fetch_issue_details(issue_url)
            issues_data.append(issue_data)
    return issues_data
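# The HTML scraping above is easy to break; GitHub's REST API returns the same
# fields as stable JSON. A minimal, optional sketch of that approach (assumes
# the public api.github.com endpoint, unauthenticated and therefore
# rate-limited; it is not called by the main flow below):
def fetch_issue_data_api(username, repository, start_page, end_page):
    issues_data = []
    for page in range(start_page, end_page + 1):
        resp = requests.get(
            f"https://api.github.com/repos/{username}/{repository}/issues",
            params={"state": "all", "page": page, "per_page": 30},
            headers={"Accept": "application/vnd.github+json"},
        )
        resp.raise_for_status()
        for issue in resp.json():
            if "pull_request" in issue:  # the issues endpoint also lists PRs
                continue
            issues_data.append({
                'title': issue['title'],
                'body': issue['body'] or '',
                'created_at': issue['created_at'],
                'closed_at': issue['closed_at'],
                'author': issue['user']['login'],
                'assignee': issue['assignee']['login'] if issue['assignee'] else None,
            })
    return issues_data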
# Function to fetch details of a specific issue page.
# As above, these selectors track GitHub's current HTML and may need updating.
def fetch_issue_details(issue_url):
    response = requests.get(issue_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    issue_title = soup.find('h1', class_='gh-header-title').text.strip()
    issue_body = soup.find('div', class_='markdown-body').text.strip()
    issue_created_at = soup.find('relative-time')['datetime']
    issue_closed_at = soup.find('relative-time', class_='no-wrap')
    issue_closed_at = issue_closed_at['datetime'] if issue_closed_at else None
    issue_author = soup.find('a', class_='author').text.strip()
    issue_assignee = soup.find('a', class_='Link--muted')
    issue_assignee = issue_assignee.text.strip() if issue_assignee else None
    return {
        'title': issue_title,
        'body': issue_body,
        'created_at': issue_created_at,
        'closed_at': issue_closed_at,
        'author': issue_author,
        'assignee': issue_assignee
    }
# Function to clean and structure the scraped data
def clean_and_structure_data(issues_data):
    df = pd.DataFrame(issues_data)
    df['created_at'] = pd.to_datetime(df['created_at'])
    df['closed_at'] = pd.to_datetime(df['closed_at'])
    df['resolution_time'] = (df['closed_at'] - df['created_at']).dt.days
    df['resolution_time'] = df['resolution_time'].fillna(-1)  # -1 marks issues that are still open
    df['is_closed'] = df['closed_at'].notna().astype(int)
    return df
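# Optional convenience: cache the cleaned frame locally so repeated runs do not
# re-scrape GitHub. A minimal sketch (the file name issues_cache.csv is an
# arbitrary choice; nothing else in the script depends on these helpers):
def save_issue_cache(df, path="issues_cache.csv"):
    df.to_csv(path, index=False)

def load_issue_cache(path="issues_cache.csv"):
    return pd.read_csv(path, parse_dates=['created_at', 'closed_at'])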
# Function for exploratory data analysis (EDA)
def perform_eda(df):
    # Descriptive statistics (numeric columns only)
    print("Descriptive Statistics:")
    print(df.describe())
    # Distribution of resolution time
    plt.figure(figsize=(10, 6))
    sns.histplot(df['resolution_time'], kde=True)
    plt.title('Distribution of Issue Resolution Time')
    plt.xlabel('Resolution Time (Days)')
    plt.ylabel('Frequency')
    plt.show()
    # Trend analysis (aggregated by calendar month, across years)
    df['created_at_month'] = df['created_at'].dt.month
    plt.figure(figsize=(10, 6))
    sns.lineplot(x='created_at_month', y='resolution_time', data=df)
    plt.title('Trend of Issue Resolution Time Over Months')
    plt.xlabel('Month')
    plt.ylabel('Resolution Time (Days)')
    plt.show()
    # Top authors and assignees
    top_authors = df['author'].value_counts().nlargest(10)
    top_assignees = df['assignee'].value_counts().nlargest(10)
    print("\nTop 10 Authors:")
    print(top_authors)
    print("\nTop 10 Assignees:")
    print(top_assignees)
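# A complementary EDA view that is often useful: how many issues were opened
# per calendar month. A minimal sketch (optional; not called from main below):
def plot_monthly_issue_volume(df):
    monthly_counts = df.groupby(df['created_at'].dt.to_period('M')).size()
    plt.figure(figsize=(10, 6))
    monthly_counts.plot(kind='bar')
    plt.title('Issues Opened per Month')
    plt.xlabel('Month')
    plt.ylabel('Number of Issues')
    plt.tight_layout()
    plt.show()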
# Function for text analysis using NLP
def analyze_text_content(df):
    # Text preprocessing: tokenize, drop stopwords, lemmatize
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    df['processed_body'] = df['body'].apply(
        lambda text: ' '.join(
            lemmatizer.lemmatize(word)
            for word in word_tokenize(text)
            if word.lower() not in stop_words
        )
    )
    # Topic modeling with LDA
    dictionary = Dictionary([word_tokenize(text) for text in df['processed_body']])
    corpus = [dictionary.doc2bow(word_tokenize(text)) for text in df['processed_body']]
    lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary)
    print("Top 5 Topics:")
    for topic in lda_model.print_topics(num_words=5):
        print(topic)
    # Sentiment analysis with VADER (compound score in [-1, 1])
    analyzer = SentimentIntensityAnalyzer()
    df['sentiment'] = df['body'].apply(lambda text: analyzer.polarity_scores(text)['compound'])
    print("Sentiment Analysis:")
    print(df['sentiment'].describe())
    # Word cloud of the most common words
    all_words = ' '.join(df['processed_body'])
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_words)
    plt.figure(figsize=(10, 6), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
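# TextBlob is imported above but never used; it offers a second opinion on
# sentiment (polarity in [-1, 1], subjectivity in [0, 1]). A minimal sketch of
# how it could sit alongside the VADER scores (optional; not called from main
# below, and the tb_* column names are just illustrative):
def add_textblob_sentiment(df):
    df['tb_polarity'] = df['body'].apply(lambda text: TextBlob(text).sentiment.polarity)
    df['tb_subjectivity'] = df['body'].apply(lambda text: TextBlob(text).sentiment.subjectivity)
    return df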
# Function to create a network graph of issues, authors, and assignees
def create_network_graph(df):
    graph = nx.Graph()
    for index, row in df.iterrows():
        graph.add_node(row['title'], type='issue')
        graph.add_node(row['author'], type='author')
        graph.add_edge(row['title'], row['author'])
        if pd.notna(row['assignee']):  # assignee may be None/NaN for unassigned issues
            graph.add_node(row['assignee'], type='assignee')
            graph.add_edge(row['title'], row['assignee'])
    # Interactive network graph with Plotly
    pos = nx.spring_layout(graph, k=0.5, seed=42)  # fixed seed for a reproducible layout
    edge_x = []
    edge_y = []
    for edge in graph.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        # Plotly expects flat coordinate lists; None breaks the line between edges
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines'
    )
    node_x = []
    node_y = []
    for node in graph.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
    node_trace = go.Scatter(
        x=node_x,
        y=node_y,
        mode='markers',
        marker=dict(
            color=[],
            size=10,
            line=dict(width=2, color='black')
        ),
        text=[],
        hoverinfo='text'
    )
    # Set node colors based on type (issue: red, author: blue, assignee: green)
    node_colors = []
    for node in graph.nodes():
        if graph.nodes[node]['type'] == 'issue':
            node_colors.append('red')
        elif graph.nodes[node]['type'] == 'author':
            node_colors.append('blue')
        else:
            node_colors.append('green')
    # Set node labels
    node_labels = list(graph.nodes())
    node_trace.marker.color = node_colors
    node_trace.text = node_labels
    # Create the figure
    fig = go.Figure(
        data=[edge_trace, node_trace],
        layout=go.Layout(
            title="GitHub Issue Network Graph",
            showlegend=False,
            hovermode='closest',
            margin=dict(b=20, l=5, r=5, t=40),
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
        )
    )
    fig.show()
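# For quick inspection without a browser, networkx can also render the same
# graph statically through matplotlib. A minimal sketch (optional; not called
# from main below; it computes its own layout):
def draw_network_graph_static(graph):
    pos = nx.spring_layout(graph, k=0.5, seed=42)
    colors = ['red' if graph.nodes[n]['type'] == 'issue'
              else 'blue' if graph.nodes[n]['type'] == 'author'
              else 'green' for n in graph.nodes()]
    plt.figure(figsize=(10, 6))
    nx.draw(graph, pos, node_color=colors, node_size=50, with_labels=False, edge_color='#888')
    plt.title('GitHub Issue Network Graph (static)')
    plt.show()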
# Function to build a predictive model for issue resolution time
def build_predictive_model(df):
    # Feature engineering from the creation timestamp and the people involved
    df['created_at_day'] = df['created_at'].dt.day
    df['created_at_weekday'] = df['created_at'].dt.weekday
    df['created_at_hour'] = df['created_at'].dt.hour
    df['author_encoded'] = df['author'].astype('category').cat.codes
    df['assignee_encoded'] = df['assignee'].astype('category').cat.codes
    # Select features and target variable ('sentiment' is added by analyze_text_content)
    features = ['created_at_day', 'created_at_weekday', 'created_at_hour', 'author_encoded', 'assignee_encoded', 'sentiment']
    target = 'resolution_time'
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)
    # Create a pipeline for feature scaling and model training.
    # Note: a classifier treats every distinct day-count (and the -1 used for
    # still-open issues) as its own class; a regression model is a natural
    # alternative (see the sketch after this function).
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', RandomForestClassifier(random_state=42))
    ])
    # Train the model
    pipeline.fit(X_train, y_train)
    # Evaluate the model
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    print(classification_report(y_test, y_pred))
    # Make predictions on new data
    # ...
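# A minimal regression sketch for the same task, restricted to issues that
# actually closed (resolution_time >= 0). It assumes build_predictive_model
# has already added the engineered feature columns; RandomForestRegressor and
# mean_absolute_error come from scikit-learn, which is already a dependency.
# Optional alternative, not called from main below.
def build_regression_model(df):
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_absolute_error
    closed = df[df['resolution_time'] >= 0]
    features = ['created_at_day', 'created_at_weekday', 'created_at_hour',
                'author_encoded', 'assignee_encoded', 'sentiment']
    X_train, X_test, y_train, y_test = train_test_split(
        closed[features], closed['resolution_time'], test_size=0.2, random_state=42)
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', RandomForestRegressor(random_state=42))
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print("Mean absolute error (days):", mean_absolute_error(y_test, y_pred))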
# Main entry point
if __name__ == "__main__":
    # Replace with your GitHub username and repository name
    username = "miagiii"
    repository = "miagiii"
    # Fetch issue data from GitHub (pages 1 to 10 of the issue list)
    issues_data = fetch_issue_data(username, repository, 1, 10)
    # Clean and structure the data
    df = clean_and_structure_data(issues_data)
    # Perform exploratory data analysis (EDA)
    perform_eda(df)
    # Analyze text content using NLP
    analyze_text_content(df)
    # Create a network graph of issues, authors, and assignees
    create_network_graph(df)
    # Build a predictive model for issue resolution time
    build_predictive_model(df)
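# Dependency note: the imports above correspond roughly to the following
# packages (names are the usual PyPI ones; pin versions as needed):
#   pip install requests beautifulsoup4 pandas numpy matplotlib seaborn nltk \
#       gensim textblob vaderSentiment networkx scikit-learn plotly wordcloud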