Spaces:

davidmezzetti
/

analyzestars

Sleeping

App Files Files Community

davidmezzetti committed on Dec 20, 2024

Commit

9f77c49

verified ·

1 Parent(s): 4113c67

Upload app.py

Browse files

Files changed (1) hide show

app.py +157 -0

app.py ADDED Viewed

	@@ -0,0 +1,157 @@

+"""
+This application enables exploration with data from the paper:
+4.5 Million (Suspected) Fake Stars in GitHub: A Growing Spiral of Popularity Contests, Scams, and Malware
+https://arxiv.org/abs/2412.13459
+Requires the following packages
+  pip install streamlit
+"""
+import os
+import pandas as pd
+import streamlit as st
class Application:
    """
    Main application.

    Loads the StarScout fake star data files into a single dataframe and renders a Streamlit page
    that looks up repositories in that dataframe.
    """

    # Root location of the data files in the StarScout GitHub project
    SOURCE = "https://github.com/hehao98/StarScout/raw/refs/heads/main/data"

    # Default dataset version, used as the directory name under SOURCE
    VERSION = "241001"

    # URL prefixes removed from the start of an input line, matched case-insensitively
    PREFIXES = (
        "https://github.com/",
        "http://github.com/",
        "https://www.github.com/",
        "http://www.github.com/",
        "github.com/",
    )

    def __init__(self):
        """
        Creates a new application.
        """

        # Load data from GitHub project
        self.data = self.load()

    def load(self, version=None):
        """
        Loads data from the source GitHub project.

        Args:
            version: dataset version directory to read, defaults to VERSION

        Returns:
            dataframe with columns repo, month, clustered, low activity, flagged stars, total stars, flagged %
        """

        # Read data
        base = f"{self.SOURCE}/{version if version else self.VERSION}"
        clustered = pd.read_csv(f"{base}/fake_stars_clustered_stars_by_month.csv")
        activity = pd.read_csv(f"{base}/fake_stars_low_activity_stars_by_month.csv")

        return self._combine(clustered, activity)

    def _combine(self, clustered, activity):
        """
        Merges the clustered and low activity dataframes and derives the flagged star columns.

        Args:
            clustered: dataframe with repo, month, n_stars and n_stars_clustered columns
            activity: dataframe with repo, month, n_stars and n_stars_low_activity columns

        Returns:
            dataframe
        """

        data = pd.merge(clustered, activity, how="outer", on=["repo", "month"])

        # Remove duplicate stars column. Both inputs have n_stars and the outer join leaves one side
        # empty when a (repo, month) pair is only in one input, max picks the populated value.
        data["n_stars"] = pd.to_numeric(data[["n_stars_x", "n_stars_y"]].max(axis=1), downcast="integer")
        data = data.drop(["n_stars_x", "n_stars_y"], axis=1)

        # Aggregate fake star counts, a missing count means the (repo, month) pair was absent from that input
        for column in ("n_stars_clustered", "n_stars_low_activity"):
            data[column] = pd.to_numeric(data[column].fillna(0), downcast="integer")

        # Sum both counts and cap the result at the total star count
        data["n_stars_flagged"] = data["n_stars_clustered"] + data["n_stars_low_activity"]
        data["n_stars_flagged"] = pd.to_numeric(data[["n_stars", "n_stars_flagged"]].min(axis=1), downcast="integer")

        # Calculate stat columns
        data["n_flagged_percent"] = 100 * (data["n_stars_flagged"] / data["n_stars"])

        # Rename by column name, not by position, so the labels don't depend on the column order of the source files
        data = data.rename(
            columns={
                "n_stars_clustered": "clustered",
                "n_stars_low_activity": "low activity",
                "n_stars": "total stars",
                "n_stars_flagged": "flagged stars",
                "n_flagged_percent": "flagged %",
            }
        )

        return data[["repo", "month", "clustered", "low activity", "flagged stars", "total stars", "flagged %"]]

    def run(self):
        """
        Main rendering logic.

        Reads a list of repositories from a text area. When at least one is entered, this shows a table with the
        month having the most flagged stars for each repository followed by a line chart per repository.
        """

        # List of GitHub repos
        repos = self.parse(st.text_area("**GitHub Repos, one per line**"))
        if not repos:
            return

        # Lowercase the repo column once and filter each repo once, the rows are used by both the table and the charts
        names = self.data["repo"].str.lower()
        matches = [(repo, self.data[names == repo.lower()]) for repo in repos]

        # Get top result per project
        frames = [rows.sort_values("flagged stars", ascending=False).head(1) for _, rows in matches]

        # Aggregate into single data frame and display
        aggregate = pd.concat(frames, axis=0)

        st.markdown("**Top month flagged by project**")
        st.dataframe(
            aggregate.sort_values("flagged %", ascending=False).reset_index(drop=True),
            column_config={
                "flagged %": st.column_config.NumberColumn(
                    format="%.2f %%"
                )
            },
            use_container_width=True
        )

        # Stars over time for each project
        for repo, rows in matches:
            st.markdown(f"**{repo}**")
            st.line_chart(
                data=rows.sort_values("month"),
                x="month",
                y=["total stars", "flagged stars"],
                color=["#F44336", "#2196F3"],
            )

    def parse(self, repos):
        """
        Parses and cleans the input repos string.

        Each line is stripped of surrounding whitespace, a leading GitHub URL prefix and
        leading/trailing slashes. Lines that are empty after cleaning are dropped.

        Args:
            repos: input string, one repository per line

        Returns:
            list of repository names in input order
        """

        outputs = []
        for line in repos.splitlines():
            repo = line.strip()

            # Remove the first matching URL prefix, if any
            lowered = repo.lower()
            for prefix in self.PREFIXES:
                if lowered.startswith(prefix):
                    repo = repo[len(prefix):]
                    break

            # A pasted URL often ends with a slash, which would prevent an exact match on the repo name
            repo = repo.strip("/")
            if repo:
                outputs.append(repo)

        return outputs
@st.cache_resource(show_spinner="Initializing application...")
def create():
    """
    Builds the Application instance used by this page.

    The function is wrapped with st.cache_resource, which shows the configured spinner
    message while the instance is being constructed.

    Returns:
        Application
    """

    application = Application()
    return application
if __name__ == "__main__":
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # Page settings
    settings = {
        "page_title": "4.5 Million (Suspected) Fake Stars in GitHub",
        "page_icon": "⭐",
        "layout": "centered",
        "initial_sidebar_state": "auto",
        "menu_items": None,
    }
    st.set_page_config(**settings)

    # Page title
    st.markdown("## 4.5 Million (Suspected) Fake ⭐'s in GitHub")

    # Introduction with links to the paper and project along with the authors' disclaimer
    overview = """
This application explores the data provided by the paper titled:
_4.5 Million (Suspected) Fake Stars in GitHub: A Growing Spiral of Popularity Contests, Scams, and Malware_
_[Paper](https://arxiv.org/abs/2412.13459) | [GitHub Project](https://github.com/hehao98/StarScout)_
Note the disclaimer from the paper's author's.
**Disclaimer**. _As we discussed in Section 3.4 and 3.5 in our paper, the resulting dataset are only repositories and users with suspected
fake stars. The individual repositories and users in our dataset may be false positives. The main purpose of our dataset is for statistical
analyses (which tolerates noises reasonably well), not for publicly shaming individual repositories. If you intend to publish subsequent work
based on our dataset, please be aware of this limitation and its ethical implications._
"""
    st.markdown(overview)

    # Create and run application
    create().run()