File size: 5,749 Bytes
9f77c49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6cf5129
9f77c49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9798049
 
9f77c49
 
9798049
9f77c49
 
 
 
 
 
 
 
9798049
9f77c49
 
 
 
 
 
 
 
 
 
 
6cf5129
 
 
9f77c49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6cf5129
9f77c49
 
 
 
 
595da74
6cf5129
595da74
6cf5129
 
9f77c49
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
"""
This application enables exploration with data from the paper:

4.5 Million (Suspected) Fake Stars in GitHub: A Growing Spiral of Popularity Contests, Scams, and Malware
https://arxiv.org/abs/2412.13459

Requires the following packages:
  pip install pandas streamlit
"""

import os

import pandas as pd
import streamlit as st


class Application:
    """
    Streamlit application that explores the StarScout (suspected) fake-stars dataset.
    """

    def __init__(self):
        """
        Creates a new application and loads the backing dataset.
        """

        # Load data from GitHub project
        self.data = self.load()

    def load(self):
        """
        Loads and joins the clustered and low-activity fake-star datasets
        from the source GitHub project.

        Returns:
            dataframe with one row per (repo, month) and the columns
            [repo, month, clustered, low activity, flagged stars, total stars, flagged %]
        """

        # Read data - two detection methods published per (repo, month)
        version = "241001"
        clustered = pd.read_csv(f"https://github.com/hehao98/StarScout/raw/refs/heads/main/data/{version}/fake_stars_clustered_stars_by_month.csv")
        activity = pd.read_csv(f"https://github.com/hehao98/StarScout/raw/refs/heads/main/data/{version}/fake_stars_low_activity_stars_by_month.csv")
        data = pd.merge(clustered, activity, how="outer", on=["repo", "month"])

        # The outer join duplicates the stars column as n_stars_x / n_stars_y.
        # Take the row max since either side may be missing (NaN) after the join.
        data["n_stars"] = pd.to_numeric(data[["n_stars_x", "n_stars_y"]].max(axis=1), downcast="integer")
        data = data.drop(["n_stars_x", "n_stars_y"], axis=1)

        # Aggregate fake star counts, treating a repo missing from one dataset as zero
        data["n_stars_clustered"] = pd.to_numeric(data["n_stars_clustered"].fillna(0), downcast="integer")
        data["n_stars_low_activity"] = pd.to_numeric(data["n_stars_low_activity"].fillna(0), downcast="integer")
        data["n_stars_flagged"] = data["n_stars_clustered"] + data["n_stars_low_activity"]

        # Cap flagged stars at the total star count - the two detection methods can overlap
        data["n_stars_flagged"] = pd.to_numeric(data[["n_stars", "n_stars_flagged"]].min(axis=1), downcast="integer")

        # Calculate stat columns
        data["n_flagged_percent"] = 100 * (data["n_stars_flagged"] / data["n_stars"])

        # Rename and organize columns
        data.columns = ["repo", "month", "clustered", "low activity", "total stars", "flagged stars", "flagged %"]
        return data[["repo", "month", "clustered", "low activity", "flagged stars", "total stars", "flagged %"]]

    def run(self):
        """
        Main rendering logic: reads a list of repos from the user, shows the
        worst flagged month per repo, then a star chart per repo.
        """

        # List of GitHub repos
        repos = st.text_area("**GitHub Repos, one per line**")

        # Format input
        repos = self.parse(repos)

        if repos:
            # Lowercase the repo column once for case-insensitive matching,
            # instead of re-lowering the whole column on every lookup
            names = self.data["repo"].str.lower()

            # Get top result (month with most flagged stars) per project
            frames = []
            for repo in repos:
                df = self.data[names == repo.lower()].sort_values("flagged stars", ascending=False)[:1]
                frames.append(df)

            # Aggregate into single data frame and display, worst offenders first
            aggregate = pd.concat(frames, axis=0)
            aggregate = aggregate.sort_values("flagged %", ascending=False).reset_index(drop=True)

            st.markdown("**Top month flagged by project**")
            st.dataframe(
                data=aggregate,
                column_config={
                    "flagged %": st.column_config.NumberColumn(
                        format="%.2f %%"
                    )
                },
                use_container_width=True
            )

            # Per-repo star history chart (total vs flagged stars by month)
            for repo in aggregate["repo"]:
                st.markdown(f"**{repo}**")
                st.line_chart(
                    data=self.data[names == repo.lower()].sort_values("month"),
                    x="month",
                    y=["total stars", "flagged stars"],
                    color=["#F44336", "#2196F3"],
                )

    def parse(self, repos):
        """
        Parses and cleans the input repos string.

        Args:
            repos: newline-delimited string of repo names and/or GitHub URLs

        Returns:
            list of "owner/name" repo identifiers, blank lines removed
        """

        outputs = []
        for repo in repos.split("\n"):
            # Strip surrounding whitespace first - this also handles "\r\n" line
            # endings, which otherwise leave a trailing "\r" that never matches
            # any repo name. Then drop the URL prefix and any trailing slash.
            repo = repo.strip().replace("https://github.com/", "").rstrip("/")
            if repo:
                outputs.append(repo)

        return outputs


@st.cache_resource(show_spinner="Initializing application...")
def create():
    """
    Builds the Application instance once and caches it across Streamlit reruns.

    Returns:
        Application
    """

    application = Application()
    return application


if __name__ == "__main__":
    # Silence the HuggingFace tokenizers fork warning.
    # NOTE(review): nothing in this file uses tokenizers - presumably a leftover
    # from a shared template; confirm before removing.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # Page configuration must be the first Streamlit command executed
    st.set_page_config(
        page_title="4.5 Million (Suspected) Fake Stars in GitHub",
        page_icon="⭐",
        layout="centered",
        initial_sidebar_state="auto",
        menu_items=None,
    )
    st.markdown("## 4.5 Million (Suspected) Fake ⭐'s in GitHub")

    # Intro text: paper/project links plus the authors' disclaimer about false positives
    st.markdown(
        """
This application explores the data provided by the paper titled: 

_4.5 Million (Suspected) Fake Stars in GitHub: A Growing Spiral of Popularity Contests, Scams, and Malware_

_[Paper](https://arxiv.org/abs/2412.13459) | [GitHub Project](https://github.com/hehao98/StarScout)_

Note the disclaimer from the paper's authors.

**Disclaimer**. _As we discussed in Section 3.4 and 3.5 in our paper, the resulting dataset are only repositories and users with suspected
fake stars. The individual repositories and users in our dataset may be false positives. The main purpose of our dataset is for statistical
analyses (which tolerates noises reasonably well), not for publicly shaming individual repositories. If you intend to publish subsequent work
based on our dataset, please be aware of this limitation and its ethical implications._

To add to the authors disclaimer.

_It's also worth noting that projects that trend on popular sites such as the GitHub Trending Page can attract a lot of automated behavior outside
of a project's control. This dataset is just a data point that shouldn't be used in a vacuum._ 
"""
    )

    # Create and run application (cached across reruns via st.cache_resource)
    app = create()
    app.run()