davidmezzetti commited on
Commit
9f77c49
·
verified ·
1 Parent(s): 4113c67

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +157 -0
app.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This application enables exploration with data from the paper:
3
+
4
+ 4.5 Million (Suspected) Fake Stars in GitHub: A Growing Spiral of Popularity Contests, Scams, and Malware
5
+ https://arxiv.org/abs/2412.13459
6
+
7
+ Requires the following packages
8
+ pip install streamlit
9
+ """
10
+
11
+ import os
12
+
13
+ import pandas as pd
14
+ import streamlit as st
15
+
16
+
17
class Application:
    """
    Main application.

    Loads the suspected fake star dataset once at construction time and renders
    a Streamlit view that looks up user-supplied GitHub repositories in it.
    """

    def __init__(self):
        """
        Creates a new application.
        """

        # Load data from GitHub project
        self.data = self.load()

    def load(self):
        """
        Loads data from the source GitHub project.

        Reads the clustered and low activity monthly star files, merges them on
        (repo, month) and derives the flagged star count and flagged percentage.

        Returns:
            dataframe with columns: repo, month, clustered, low activity,
            flagged stars, total stars, flagged %
        """

        # Read data
        version = "241001"
        base = f"https://github.com/hehao98/StarScout/raw/refs/heads/main/data/{version}"
        clustered = pd.read_csv(f"{base}/fake_stars_clustered_stars_by_month.csv")
        activity = pd.read_csv(f"{base}/fake_stars_low_activity_stars_by_month.csv")

        # Outer merge keeps rows that only appear in one of the two files
        data = pd.merge(clustered, activity, how="outer", on=["repo", "month"])

        # Remove duplicate stars column, max() skips the side missing after the outer merge
        data["n_stars"] = pd.to_numeric(data[["n_stars_x", "n_stars_y"]].max(axis=1), downcast="integer")
        data = data.drop(["n_stars_x", "n_stars_y"], axis=1)

        # Aggregate fake star counts
        data["n_stars_clustered"] = pd.to_numeric(data["n_stars_clustered"].fillna(0), downcast="integer")
        data["n_stars_low_activity"] = pd.to_numeric(data["n_stars_low_activity"].fillna(0), downcast="integer")
        data["n_stars_flagged"] = data["n_stars_clustered"] + data["n_stars_low_activity"]

        # Cap flagged stars at the total number of stars for the month
        data["n_stars_flagged"] = pd.to_numeric(data[["n_stars", "n_stars_flagged"]].min(axis=1), downcast="integer")

        # Calculate stat columns
        data["n_flagged_percent"] = 100 * (data["n_stars_flagged"] / data["n_stars"])

        # Rename by name rather than by position, this doesn't depend on the column order of the source files
        data = data.rename(
            columns={
                "n_stars_clustered": "clustered",
                "n_stars_low_activity": "low activity",
                "n_stars": "total stars",
                "n_stars_flagged": "flagged stars",
                "n_flagged_percent": "flagged %",
            }
        )

        return data[["repo", "month", "clustered", "low activity", "flagged stars", "total stars", "flagged %"]]

    def run(self):
        """
        Main rendering logic.

        Reads a list of repos from a text area, shows the month with the most
        flagged stars for each repo in a table and then plots total vs flagged
        stars by month for each repo.
        """

        # List of GitHub repos
        repos = st.text_area("**GitHub Repos, one per line**")

        # Format input
        repos = self.parse(repos)

        if not repos:
            return

        # Lowercase the repo column once, matching is case-insensitive
        names = self.data["repo"].str.lower()
        matches = [(repo, self.data[names == repo.lower()]) for repo in repos]

        # Get top result per project
        frames = [df.sort_values("flagged stars", ascending=False)[:1] for _, df in matches]

        # Aggregate into single data frame and display
        aggregate = pd.concat(frames, axis=0)
        st.markdown("**Top month flagged by project**")
        st.dataframe(
            aggregate.sort_values("flagged %", ascending=False).reset_index(drop=True),
            column_config={
                "flagged %": st.column_config.NumberColumn(
                    format="%.2f %%"
                )
            },
            use_container_width=True
        )

        for repo, df in matches:
            st.markdown(f"**{repo}**")

            # Nothing to plot when the repo isn't in the dataset
            if df.empty:
                st.info("No data found for this repository in the dataset.")
                continue

            st.line_chart(
                data=df.sort_values("month"),
                x="month",
                y=["total stars", "flagged stars"],
                color=["#F44336", "#2196F3"],
            )

    def parse(self, repos):
        """
        Parses and cleans the input repos string.

        Each line is stripped of surrounding whitespace, the GitHub URL prefix
        and leading/trailing slashes. Lines that are empty after cleaning are skipped.

        Args:
            repos: input string with one repo name or GitHub URL per line

        Returns:
            list of repo names
        """

        outputs = []
        for repo in repos.splitlines():
            # Strip whitespace first so padded URLs and blank-looking lines are handled
            repo = repo.strip().replace("https://github.com/", "").strip("/")
            if repo:
                outputs.append(repo)

        return outputs
112
+
113
+
114
@st.cache_resource(show_spinner="Initializing application...")
def create():
    """
    Builds the Streamlit application, wrapped with st.cache_resource.

    Returns:
        Application
    """

    application = Application()
    return application
124
+
125
+
126
if __name__ == "__main__":
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # Page level settings
    settings = {
        "page_title": "4.5 Million (Suspected) Fake Stars in GitHub",
        "page_icon": "⭐",
        "layout": "centered",
        "initial_sidebar_state": "auto",
        "menu_items": None,
    }
    st.set_page_config(**settings)

    # Page heading
    st.markdown("## 4.5 Million (Suspected) Fake ⭐'s in GitHub")

    # Introduction and disclaimer from the paper
    description = """
        This application explores the data provided by the paper titled:

        _4.5 Million (Suspected) Fake Stars in GitHub: A Growing Spiral of Popularity Contests, Scams, and Malware_

        _[Paper](https://arxiv.org/abs/2412.13459) | [GitHub Project](https://github.com/hehao98/StarScout)_

        Note the disclaimer from the paper's author's.

        **Disclaimer**. _As we discussed in Section 3.4 and 3.5 in our paper, the resulting dataset are only repositories and users with suspected
        fake stars. The individual repositories and users in our dataset may be false positives. The main purpose of our dataset is for statistical
        analyses (which tolerates noises reasonably well), not for publicly shaming individual repositories. If you intend to publish subsequent work
        based on our dataset, please be aware of this limitation and its ethical implications._
        """
    st.markdown(description)

    # Create and run application
    create().run()