"""
This application enables exploration with data from the paper:
4.5 Million (Suspected) Fake Stars in GitHub: A Growing Spiral of Popularity Contests, Scams, and Malware
https://arxiv.org/abs/2412.13459
Requires the following packages
pip install streamlit
"""
import os
import pandas as pd
import streamlit as st
class Application:
    """
    Main application.
    """

    def __init__(self):
        """
        Creates a new application.
        """

        # Load data from GitHub project
        self.data = self.load()
    def load(self):
        """
        Loads data from the source GitHub project.

        Returns:
            dataframe
        """

        # Read data
        version = "241001"
        clustered = pd.read_csv(f"https://github.com/hehao98/StarScout/raw/refs/heads/main/data/{version}/fake_stars_clustered_stars_by_month.csv")
        activity = pd.read_csv(f"https://github.com/hehao98/StarScout/raw/refs/heads/main/data/{version}/fake_stars_low_activity_stars_by_month.csv")

        data = pd.merge(clustered, activity, how="outer", on=["repo", "month"])

        # Remove duplicate stars column
        data["n_stars"] = pd.to_numeric(data[["n_stars_x", "n_stars_y"]].max(axis=1), downcast="integer")
        data = data.drop(["n_stars_x", "n_stars_y"], axis=1)

        # Aggregate fake star counts, capping flagged stars at the total star count
        data["n_stars_clustered"] = pd.to_numeric(data["n_stars_clustered"].fillna(0), downcast="integer")
        data["n_stars_low_activity"] = pd.to_numeric(data["n_stars_low_activity"].fillna(0), downcast="integer")
        data["n_stars_flagged"] = data["n_stars_clustered"] + data["n_stars_low_activity"]
        data["n_stars_flagged"] = pd.to_numeric(data[["n_stars", "n_stars_flagged"]].min(axis=1), downcast="integer")

        # Calculate stat columns
        data["n_flagged_percent"] = 100 * (data["n_stars_flagged"] / data["n_stars"])

        # Rename and organize columns
        data.columns = ["repo", "month", "clustered", "low activity", "total stars", "flagged stars", "flagged %"]
        return data[["repo", "month", "clustered", "low activity", "flagged stars", "total stars", "flagged %"]]
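
    # Illustrative shape of the frame load() returns (values below are made up, not real data):
    #   repo         month    clustered  low activity  flagged stars  total stars  flagged %
    #   owner/name   2024-07        120            35            155          900      17.22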
    def run(self):
        """
        Main rendering logic.
        """

        # List of GitHub repos
        repos = st.text_area("**GitHub Repos, one per line**")

        # Format input
        repos = self.parse(repos)
        if repos:
            # Get top result per project
            frames = []
            for repo in repos:
                df = self.data[self.data["repo"].str.lower() == repo.lower()].sort_values("flagged stars", ascending=False)[:1]
                frames.append(df)

            # Aggregate into single data frame and display
            aggregate = pd.concat(frames, axis=0)
            aggregate = aggregate.sort_values("flagged %", ascending=False).reset_index(drop=True)

            st.markdown("**Top month flagged by project**")
            st.dataframe(
                data=aggregate,
                column_config={
                    "flagged %": st.column_config.NumberColumn(
                        format="%.2f %%"
                    )
                },
                use_container_width=True
            )

            for repo in aggregate["repo"]:
                st.markdown(f"**{repo}**")
                st.line_chart(
                    data=self.data[self.data["repo"].str.lower() == repo.lower()].sort_values("month"),
                    x="month",
                    y=["total stars", "flagged stars"],
                    color=["#F44336", "#2196F3"],
                )
    def parse(self, repos):
        """
        Parses and cleans the input repos string.

        Returns:
            list of repos
        """

        outputs = []
        for repo in repos.split("\n"):
            # Accept full URLs or owner/name entries, skip blank lines
            repo = repo.strip().replace("https://github.com/", "")
            if repo:
                outputs.append(repo)

        return outputs
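
    # Illustrative usage ("owner/name" is a placeholder, not a real repo):
    #   Application().parse("https://github.com/hehao98/StarScout\nowner/name")
    #   -> ["hehao98/StarScout", "owner/name"]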
@st.cache_resource(show_spinner="Initializing application...")
def create():
    """
    Creates and caches a Streamlit application.

    Returns:
        Application
    """

    return Application()
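
# Note: st.cache_resource shares a single Application instance across reruns and sessions,
# so the source CSVs are downloaded once per server process rather than on every interaction.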
if __name__ == "__main__":
os.environ["TOKENIZERS_PARALLELISM"] = "false"
st.set_page_config(
page_title="4.5 Million (Suspected) Fake Stars in GitHub",
page_icon="⭐",
layout="centered",
initial_sidebar_state="auto",
menu_items=None,
)
st.markdown("## 4.5 Million (Suspected) Fake ⭐'s in GitHub")
st.markdown(
"""
This application explores the data provided by the paper titled:
_4.5 Million (Suspected) Fake Stars in GitHub: A Growing Spiral of Popularity Contests, Scams, and Malware_
_[Paper](https://arxiv.org/abs/2412.13459) | [GitHub Project](https://github.com/hehao98/StarScout)_
Note the disclaimer from the paper's authors.
**Disclaimer**. _As we discussed in Section 3.4 and 3.5 in our paper, the resulting dataset are only repositories and users with suspected
fake stars. The individual repositories and users in our dataset may be false positives. The main purpose of our dataset is for statistical
analyses (which tolerates noises reasonably well), not for publicly shaming individual repositories. If you intend to publish subsequent work
based on our dataset, please be aware of this limitation and its ethical implications._
To add to the authors disclaimer.
_It's also worth noting that projects that trend on popular sites such as the GitHub Trending Page can attract a lot of automated behavior outside
of a project's control. This dataset is just a data point that shouldn't be used in a vacuum._
"""
)
# Create and run application
app = create()
app.run()