import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import plotly.graph_objects as go
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# Download the NLTK resources used below (no-op if they are already installed)
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Set up logging
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Function to fetch HTML content from GitHub issue pages.
# Note: the CSS class names used below reflect GitHub's markup at the time of
# writing and may change without notice.
def fetch_issue_data(username, repository, start_page, end_page):
    issues_data = []
    for page in range(start_page, end_page + 1):
        url = f"https://github.com/{username}/{repository}/issues?page={page}"
        logging.info(f"Fetching issue list page {page}: {url}")
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        issue_elements = soup.find_all('div', class_='flex-shrink-0')
        for issue_element in issue_elements:
            link = issue_element.find('a', class_='Link--primary')
            if link is None:  # skip containers that are not issue rows
                continue
            issue_url = f"https://github.com{link['href']}"
            issue_data = fetch_issue_details(issue_url)
            issues_data.append(issue_data)
    return issues_data
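# The HTML scraping above is easy to break; GitHub's REST API returns the same
# fields as stable JSON. A minimal, optional sketch of that approach (assumes
# the public api.github.com endpoint, unauthenticated and therefore
# rate-limited; it is not called by the main flow below):
def fetch_issue_data_api(username, repository, start_page, end_page):
    issues_data = []
    for page in range(start_page, end_page + 1):
        resp = requests.get(
            f"https://api.github.com/repos/{username}/{repository}/issues",
            params={"state": "all", "page": page, "per_page": 30},
            headers={"Accept": "application/vnd.github+json"},
        )
        resp.raise_for_status()
        for issue in resp.json():
            if "pull_request" in issue:  # the issues endpoint also lists PRs
                continue
            issues_data.append({
                'title': issue['title'],
                'body': issue['body'] or '',
                'created_at': issue['created_at'],
                'closed_at': issue['closed_at'],
                'author': issue['user']['login'],
                'assignee': issue['assignee']['login'] if issue['assignee'] else None,
            })
    return issues_data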
# Function to fetch details of a specific issue page.
# As above, these selectors track GitHub's current HTML and may need updating.
def fetch_issue_details(issue_url):
    response = requests.get(issue_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    issue_title = soup.find('h1', class_='gh-header-title').text.strip()
    issue_body = soup.find('div', class_='markdown-body').text.strip()
    issue_created_at = soup.find('relative-time')['datetime']
    issue_closed_at = soup.find('relative-time', class_='no-wrap')
    issue_closed_at = issue_closed_at['datetime'] if issue_closed_at else None
    issue_author = soup.find('a', class_='author').text.strip()
    issue_assignee = soup.find('a', class_='Link--muted')
    issue_assignee = issue_assignee.text.strip() if issue_assignee else None
    return {
        'title': issue_title,
        'body': issue_body,
        'created_at': issue_created_at,
        'closed_at': issue_closed_at,
        'author': issue_author,
        'assignee': issue_assignee
    }
# Function to clean and structure the scraped data
def clean_and_structure_data(issues_data):
    df = pd.DataFrame(issues_data)
    df['created_at'] = pd.to_datetime(df['created_at'])
    df['closed_at'] = pd.to_datetime(df['closed_at'])
    df['resolution_time'] = (df['closed_at'] - df['created_at']).dt.days
    df['resolution_time'] = df['resolution_time'].fillna(-1)  # -1 marks issues that are still open
    df['is_closed'] = df['closed_at'].notna().astype(int)
    return df
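# Optional convenience: cache the cleaned frame locally so repeated runs do not
# re-scrape GitHub. A minimal sketch (the file name issues_cache.csv is an
# arbitrary choice; nothing else in the script depends on these helpers):
def save_issue_cache(df, path="issues_cache.csv"):
    df.to_csv(path, index=False)

def load_issue_cache(path="issues_cache.csv"):
    return pd.read_csv(path, parse_dates=['created_at', 'closed_at'])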
# Function for exploratory data analysis (EDA)
def perform_eda(df):
    # Descriptive statistics (numeric columns only)
    print("Descriptive Statistics:")
    print(df.describe())
    # Distribution of resolution time
    plt.figure(figsize=(10, 6))
    sns.histplot(df['resolution_time'], kde=True)
    plt.title('Distribution of Issue Resolution Time')
    plt.xlabel('Resolution Time (Days)')
    plt.ylabel('Frequency')
    plt.show()
    # Trend analysis (aggregated by calendar month, across years)
    df['created_at_month'] = df['created_at'].dt.month
    plt.figure(figsize=(10, 6))
    sns.lineplot(x='created_at_month', y='resolution_time', data=df)
    plt.title('Trend of Issue Resolution Time Over Months')
    plt.xlabel('Month')
    plt.ylabel('Resolution Time (Days)')
    plt.show()
    # Top authors and assignees
    top_authors = df['author'].value_counts().nlargest(10)
    top_assignees = df['assignee'].value_counts().nlargest(10)
    print("\nTop 10 Authors:")
    print(top_authors)
    print("\nTop 10 Assignees:")
    print(top_assignees)
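# A complementary EDA view that is often useful: how many issues were opened
# per calendar month. A minimal sketch (optional; not called from main below):
def plot_monthly_issue_volume(df):
    monthly_counts = df.groupby(df['created_at'].dt.to_period('M')).size()
    plt.figure(figsize=(10, 6))
    monthly_counts.plot(kind='bar')
    plt.title('Issues Opened per Month')
    plt.xlabel('Month')
    plt.ylabel('Number of Issues')
    plt.tight_layout()
    plt.show()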
# Function for text analysis using NLP
def analyze_text_content(df):
    # Text preprocessing: tokenize, drop stopwords, lemmatize
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    df['processed_body'] = df['body'].apply(
        lambda text: ' '.join(
            lemmatizer.lemmatize(word)
            for word in word_tokenize(text)
            if word.lower() not in stop_words
        )
    )
    # Topic modeling with LDA
    dictionary = Dictionary([word_tokenize(text) for text in df['processed_body']])
    corpus = [dictionary.doc2bow(word_tokenize(text)) for text in df['processed_body']]
    lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary)
    print("Top 5 Topics:")
    for topic in lda_model.print_topics(num_words=5):
        print(topic)
    # Sentiment analysis with VADER (compound score in [-1, 1])
    analyzer = SentimentIntensityAnalyzer()
    df['sentiment'] = df['body'].apply(lambda text: analyzer.polarity_scores(text)['compound'])
    print("Sentiment Analysis:")
    print(df['sentiment'].describe())
    # Word cloud of the most common words
    all_words = ' '.join(df['processed_body'])
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_words)
    plt.figure(figsize=(10, 6), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
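# TextBlob is imported above but never used; it offers a second opinion on
# sentiment (polarity in [-1, 1], subjectivity in [0, 1]). A minimal sketch of
# how it could sit alongside the VADER scores (optional; not called from main
# below, and the tb_* column names are just illustrative):
def add_textblob_sentiment(df):
    df['tb_polarity'] = df['body'].apply(lambda text: TextBlob(text).sentiment.polarity)
    df['tb_subjectivity'] = df['body'].apply(lambda text: TextBlob(text).sentiment.subjectivity)
    return df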
# Function to create a network graph of issues, authors, and assignees
def create_network_graph(df):
    graph = nx.Graph()
    for index, row in df.iterrows():
        graph.add_node(row['title'], type='issue')
        graph.add_node(row['author'], type='author')
        graph.add_edge(row['title'], row['author'])
        if pd.notna(row['assignee']):  # assignee may be None/NaN for unassigned issues
            graph.add_node(row['assignee'], type='assignee')
            graph.add_edge(row['title'], row['assignee'])
    # Interactive network graph with Plotly
    pos = nx.spring_layout(graph, k=0.5, seed=42)  # fixed seed for a reproducible layout
    edge_x = []
    edge_y = []
    for edge in graph.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        # Plotly expects flat coordinate lists; None breaks the line between edges
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines'
    )
    node_x = []
    node_y = []
    for node in graph.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
    node_trace = go.Scatter(
        x=node_x,
        y=node_y,
        mode='markers',
        marker=dict(
            color=[],
            size=10,
            line=dict(width=2, color='black')
        ),
        text=[],
        hoverinfo='text'
    )
    # Set node colors based on type (issue: red, author: blue, assignee: green)
    node_colors = []
    for node in graph.nodes():
        if graph.nodes[node]['type'] == 'issue':
            node_colors.append('red')
        elif graph.nodes[node]['type'] == 'author':
            node_colors.append('blue')
        else:
            node_colors.append('green')
    # Set node labels
    node_labels = list(graph.nodes())
    node_trace.marker.color = node_colors
    node_trace.text = node_labels
    # Create the figure
    fig = go.Figure(
        data=[edge_trace, node_trace],
        layout=go.Layout(
            title="GitHub Issue Network Graph",
            showlegend=False,
            hovermode='closest',
            margin=dict(b=20, l=5, r=5, t=40),
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
        )
    )
    fig.show()
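# For quick inspection without a browser, networkx can also render the same
# graph statically through matplotlib. A minimal sketch (optional; not called
# from main below; it computes its own layout):
def draw_network_graph_static(graph):
    pos = nx.spring_layout(graph, k=0.5, seed=42)
    colors = ['red' if graph.nodes[n]['type'] == 'issue'
              else 'blue' if graph.nodes[n]['type'] == 'author'
              else 'green' for n in graph.nodes()]
    plt.figure(figsize=(10, 6))
    nx.draw(graph, pos, node_color=colors, node_size=50, with_labels=False, edge_color='#888')
    plt.title('GitHub Issue Network Graph (static)')
    plt.show()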
# Function to build a predictive model for issue resolution time
def build_predictive_model(df):
    # Feature engineering from the creation timestamp and the people involved
    df['created_at_day'] = df['created_at'].dt.day
    df['created_at_weekday'] = df['created_at'].dt.weekday
    df['created_at_hour'] = df['created_at'].dt.hour
    df['author_encoded'] = df['author'].astype('category').cat.codes
    df['assignee_encoded'] = df['assignee'].astype('category').cat.codes
    # Select features and target variable ('sentiment' is added by analyze_text_content)
    features = ['created_at_day', 'created_at_weekday', 'created_at_hour', 'author_encoded', 'assignee_encoded', 'sentiment']
    target = 'resolution_time'
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)
    # Create a pipeline for feature scaling and model training.
    # Note: a classifier treats every distinct day-count (and the -1 used for
    # still-open issues) as its own class; a regression model is a natural
    # alternative (see the sketch after this function).
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', RandomForestClassifier(random_state=42))
    ])
    # Train the model
    pipeline.fit(X_train, y_train)
    # Evaluate the model
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    print(classification_report(y_test, y_pred))
    # Make predictions on new data
    # ...
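# A minimal regression sketch for the same task, restricted to issues that
# actually closed (resolution_time >= 0). It assumes build_predictive_model
# has already added the engineered feature columns; RandomForestRegressor and
# mean_absolute_error come from scikit-learn, which is already a dependency.
# Optional alternative, not called from main below.
def build_regression_model(df):
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_absolute_error
    closed = df[df['resolution_time'] >= 0]
    features = ['created_at_day', 'created_at_weekday', 'created_at_hour',
                'author_encoded', 'assignee_encoded', 'sentiment']
    X_train, X_test, y_train, y_test = train_test_split(
        closed[features], closed['resolution_time'], test_size=0.2, random_state=42)
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', RandomForestRegressor(random_state=42))
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print("Mean absolute error (days):", mean_absolute_error(y_test, y_pred))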
# Main entry point
if __name__ == "__main__":
    # Replace with your GitHub username and repository name
    username = "miagiii"
    repository = "miagiii"
    # Fetch issue data from GitHub (pages 1 to 10 of the issue list)
    issues_data = fetch_issue_data(username, repository, 1, 10)
    # Clean and structure the data
    df = clean_and_structure_data(issues_data)
    # Perform exploratory data analysis (EDA)
    perform_eda(df)
    # Analyze text content using NLP
    analyze_text_content(df)
    # Create a network graph of issues, authors, and assignees
    create_network_graph(df)
    # Build a predictive model for issue resolution time
    build_predictive_model(df)
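# Dependency note: the imports above correspond roughly to the following
# packages (names are the usual PyPI ones; pin versions as needed):
#   pip install requests beautifulsoup4 pandas numpy matplotlib seaborn nltk \
#       gensim textblob vaderSentiment networkx scikit-learn plotly wordcloud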