Spaces:

gretelai
/

dataset-card-generator

Running

App Files Files Community

Alexander Watson committed on Nov 11, 2024

Commit

eb03925

1 Parent(s): 28fb096

initial checkin

Browse files

Files changed (9) hide show

.gitignore +42 -0
LICENSE +201 -0
README.md +1 -14
app.py +11 -0
requirements.txt +14 -0
src/app.py +299 -0
src/utils/__init__.py +0 -0
src/utils/analysis.py +486 -0
src/utils/visualization.py +162 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,42 @@

+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+# Virtual Environment
+venv/
+ENV/
+# IDEs
+.idea/
+.vscode/
+*.swp
+*.swo
+# OS
+.DS_Store
+Thumbs.db
+# Streamlit
+.streamlit/secrets.toml
+# Local development
+.env

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md CHANGED Viewed

@@ -1,14 +1 @@
----
-title: Dataset Card Generator
-emoji: 🦀
-colorFrom: green
-colorTo: yellow
-sdk: streamlit
-sdk_version: 1.40.0
-app_file: app.py
-pinned: false
-license: apache-2.0
-short_description: Generate beautiful documentation for your HF datasets
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference


1	+ # data-card-generator

app.py ADDED Viewed

	@@ -0,0 +1,11 @@

import sys
from pathlib import Path

# Put the src directory at the FRONT of the import path. This launcher is
# itself called app.py, and the launcher's own directory is normally searched
# before anything appended to sys.path, so `from app import main` would
# otherwise resolve to this very file (a circular import) instead of
# src/app.py.
src_path = str(Path(__file__).resolve().parent / "src")
if src_path not in sys.path:  # the script may be executed more than once
    sys.path.insert(0, src_path)

# Import and run the actual app
from app import main

main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+streamlit==1.31.1
+pandas==2.2.0
+matplotlib==3.8.2
+seaborn==0.13.2
+datasets==2.17.0
+huggingface-hub==0.20.3
+wordcloud==1.9.3
+PyYAML==6.0.1
+openai==1.12.0
+python-dotenv==1.0.1
+plotly==5.18.0
+kaleido==0.2.1
+scipy==1.12.0
+tiktoken==0.7.0

src/app.py ADDED Viewed

	@@ -0,0 +1,299 @@

+import json
+import pandas as pd
+import streamlit as st
+from datasets import load_dataset
+from huggingface_hub import HfApi, login
+from openai import OpenAI
+# Import our utility functions
+from utils.analysis import analyze_dataset_with_openai, generate_dataset_card
+from utils.visualization import create_distribution_plot, create_wordcloud
+# Initialize session state variables
# Seed every session-state key the app reads, leaving existing values alone.
# Factories are used so each list default is a fresh object.
for _state_key, _make_default in (
    ("openai_analysis", lambda: None),
    ("df", lambda: None),
    ("dataset_name", lambda: None),
    ("selected_dist_columns", list),
    ("selected_wordcloud_columns", list),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()

# Page-level configuration: title, icon and wide layout.
_PAGE_CONFIG = {
    "page_title": "Dataset Card Generator",
    "page_icon": "📊",
    "layout": "wide",
}
st.set_page_config(**_PAGE_CONFIG)
def initialize_openai_client(api_key):
    """Build the OpenAI client used for dataset analysis.

    Args:
        api_key: OpenAI API key handed to the client constructor.

    Returns:
        An ``OpenAI`` client created with ``api_key``.
    """
    client = OpenAI(api_key=api_key)
    return client
def load_and_analyze_dataset(dataset_name, split="train"):
    """Load a HuggingFace dataset into session state and run the OpenAI analysis.

    On a successful load ``st.session_state.df`` and
    ``st.session_state.dataset_name`` are replaced and
    ``st.session_state.openai_analysis`` is reset, so a failed analysis never
    leaves the previous dataset's analysis in place. Progress and errors are
    reported through an ``st.status`` widget.

    Args:
        dataset_name: Hub identifier passed to ``load_dataset``.
        split: Dataset split to load. Defaults to ``"train"``.
    """
    progress_container = st.empty()
    with progress_container.container():
        with st.status("Loading dataset...", expanded=True) as status:
            # Step 1: load the dataset. A failure here leaves session state alone.
            try:
                status.write("📥 Loading dataset from HuggingFace...")
                dataset = load_dataset(dataset_name, split=split)
                df = pd.DataFrame(dataset)
            except Exception as e:
                status.update(label=f"❌ Error: {str(e)}", state="error")
                st.error(f"Failed to load dataset: {str(e)}")
                return

            st.session_state.df = df
            st.session_state.dataset_name = dataset_name
            # Drop any analysis that belongs to a previously loaded dataset.
            st.session_state.openai_analysis = None

            # Step 2: analyse a small sample. The dataset stays loaded even if
            # this fails, but the status must keep showing the error.
            try:
                status.write("🤖 Analyzing dataset ...")
                client = initialize_openai_client(st.session_state.openai_key)
                # Integer indexing yields one row dict per record, whereas a
                # slice yields a single dict of column lists.
                sample_data = [dataset[i] for i in range(min(5, len(dataset)))]
                analysis = analyze_dataset_with_openai(client, sample_data)
                st.session_state.openai_analysis = analysis
            except Exception as e:
                print(f"Analysis error: {str(e)}")
                status.update(label=f"❌ Error: {str(e)}", state="error")
                return

            status.update(
                label="✅ Dataset loaded and analyzed successfully!",
                state="complete",
            )
def display_dataset_analysis():
    """Show the dataset preview, the visualization pickers and the generate button.

    Does nothing when no dataset is loaded. The chosen columns are stored in
    ``st.session_state.selected_dist_columns`` and
    ``st.session_state.selected_wordcloud_columns``. Pressing the button with
    at least one selection calls ``generate_and_display_card``.
    """
    df = st.session_state.df
    if df is None:
        return

    st.header("Dataset Analysis")

    # Dataset preview
    with st.expander("📊 Dataset Preview", expanded=True):
        st.dataframe(df.head(), use_container_width=True)

    # Column selection for visualizations
    st.subheader("Select Visualization Fields")
    col1, col2 = st.columns(2)

    with col1:
        # Distribution plot selection
        st.session_state.selected_dist_columns = st.multiselect(
            "Distribution Plots (max 2)",
            options=df.columns.tolist(),
            format_func=lambda x: get_column_type_description(df, x),
            max_selections=2,
            help="Select columns to show value distributions. List columns will show frequency of individual items.",
        )

    with col2:
        # Word cloud selection. Only peek at the first value when there is
        # one; an empty frame would otherwise raise IndexError on iloc[0].
        has_rows = len(df) > 0
        text_columns = [
            col
            for col in df.columns
            if df[col].dtype == "object"
            or (has_rows and isinstance(df[col].iloc[0], list))
        ]
        st.session_state.selected_wordcloud_columns = st.multiselect(
            "Word Clouds (max 2)",
            options=text_columns,
            format_func=lambda x: get_column_type_description(df, x),
            max_selections=2,
            help="Select text columns to generate word clouds",
        )

    # Add some spacing
    st.markdown("---")

    # Generate card button
    if st.button("Generate Dataset Card", type="primary", use_container_width=True):
        if not (
            st.session_state.selected_dist_columns
            or st.session_state.selected_wordcloud_columns
        ):
            st.warning(
                "Please select at least one visualization before generating the card."
            )
            return
        generate_and_display_card()
def generate_and_display_card():
    """Render the dataset card for the current selections and offer it for download.

    Requires ``st.session_state.openai_analysis`` to be set; otherwise an
    error is shown and nothing is generated. Any exception raised while
    building the card is logged, reported with ``st.error`` and re-raised.
    """
    state = st.session_state
    if not state.openai_analysis:
        st.error(
            "Dataset analysis not available. Please try loading the dataset again."
        )
        return

    with st.status("Generating dataset card...", expanded=True) as status:
        try:
            # One base64 image per selected distribution column.
            status.write("📊 Creating distribution plots...")
            distribution_plots = {}
            for column in state.selected_dist_columns:
                print(f"Generating distribution plot for {column}")
                distribution_plots[column] = create_distribution_plot(state.df, column)
                print(f"Successfully created plot for {column}")

            # One base64 image per selected word-cloud column.
            status.write("🔤 Generating word clouds...")
            wordcloud_plots = {}
            for column in state.selected_wordcloud_columns:
                print(f"Generating word cloud for {column}")
                wordcloud_plots[column] = create_wordcloud(state.df, column)
                print(f"Successfully created word cloud for {column}")

            # Assemble the README text from the analysis and the images.
            status.write("📝 Composing dataset card...")
            card_markdown = generate_dataset_card(
                dataset_info={"dataset_name": state.dataset_name},
                distribution_plots=distribution_plots,
                wordcloud_plots=wordcloud_plots,
                openai_analysis=state.openai_analysis,
                df=state.df,
            )

            status.update(label="✅ Dataset card generated!", state="complete")

            # Show the card (embedded images need HTML enabled), then the
            # download button for the same text.
            st.markdown(card_markdown, unsafe_allow_html=True)
            st.download_button(
                label="⬇️ Download Dataset Card",
                data=card_markdown,
                file_name="README.md",
                mime="text/markdown",
                use_container_width=True,
            )
        except Exception as exc:
            print(f"Error in generate_and_display_card: {str(exc)}")
            st.error(f"Error generating dataset card: {str(exc)}")
            raise exc
def get_column_type_description(data, column):
    """Return ``column`` labelled with a user-friendly description of its type.

    The first value decides whether the column holds lists; otherwise the
    dtype decides between numeric (``int64``/``float64``) and
    text/categorical.

    Args:
        data: DataFrame that should contain ``column``.
        column: Name of the column to describe.

    Returns:
        ``"<column> (list)"``, ``"<column> (numeric)"``,
        ``"<column> (text/categorical)"``, or ``"<column> (unknown)"`` when
        the column is missing or has no rows to inspect.
    """
    try:
        series = data[column]
        if isinstance(series.iloc[0], list):
            return f"{column} (list)"
        if series.dtype in ["int64", "float64"]:
            return f"{column} (numeric)"
        return f"{column} (text/categorical)"
    except (KeyError, IndexError, TypeError, AttributeError):
        # Missing column, empty column, or an object that is not a plain
        # Series. Only these lookup failures are treated as "unknown", so
        # interrupts and unrelated errors are no longer swallowed.
        return f"{column} (unknown)"
def get_api_keys():
    """Read the HuggingFace token and OpenAI key from Streamlit secrets.

    Both values live under the ``api_keys`` section and are looked up
    independently, so one missing entry does not discard the other.

    Returns:
        A ``(hf_token, openai_key)`` tuple. An entry is ``None`` when it is
        not configured; both are ``None`` when the secrets or the
        ``api_keys`` section cannot be read.
    """
    try:
        api_keys = st.secrets["api_keys"]
        return api_keys.get("huggingface"), api_keys.get("openai")
    except Exception:
        # Secrets are optional: no secrets file or no [api_keys] section
        # simply means the user has to type the keys in.
        return None, None
def get_secrets():
    """Return ``(hf_token, openai_key)`` from the ``api_keys`` secrets section.

    Missing entries come back as empty strings. If reading the secrets fails
    for any reason, the error is printed and ``("", "")`` is returned.
    """

    def _lookup(name):
        # Each lookup tolerates a missing section as well as a missing key.
        return st.secrets.get("api_keys", {}).get(name, "")

    try:
        return _lookup("huggingface"), _lookup("openai")
    except Exception as e:
        print(f"No secrets file found or error reading secrets: {e}")
        return "", ""
def _ensure_session_state():
    """Create any per-session state keys that do not exist yet."""
    for key, make_default in (
        ("openai_analysis", lambda: None),
        ("df", lambda: None),
        ("dataset_name", lambda: None),
        ("selected_dist_columns", list),
        ("selected_wordcloud_columns", list),
    ):
        if key not in st.session_state:
            st.session_state[key] = make_default()


def main():
    """Render the app: authentication sidebar, dataset loader and analysis view.

    Keys typed into the sidebar take precedence; when a field is left empty
    the value configured in Streamlit secrets (if any) is used instead. The
    resolved OpenAI key is stored in ``st.session_state.openai_key``.
    """
    # A module imported by a launcher script runs its top-level code only
    # once per process, so the defaults must also be ensured here for every
    # new browser session.
    _ensure_session_state()

    st.title("📊 Dataset Card Generator")
    st.markdown(
        """
    Generate beautiful documentation for your HuggingFace datasets with automated analysis,
    visualizations, and formatted dataset cards.
    """
    )

    # Get secrets if available
    secret_hf_token, secret_openai_key = get_api_keys()

    # Authentication section in sidebar
    with st.sidebar:
        st.header("🔑 Authentication")

        # Configured secrets are never placed into the widgets: a pre-filled,
        # plain-text field would show the server-side key to every visitor.
        # Both inputs are always masked and only ever contain what the user
        # typed.
        entered_openai_key = st.text_input(
            "OpenAI API Key",
            type="password",
            placeholder="Using configured key" if secret_openai_key else None,
            help="Required: Your OpenAI API key for dataset analysis",
        )
        entered_hf_token = st.text_input(
            "HuggingFace Token (optional)",
            type="password",
            placeholder="Using configured token" if secret_hf_token else None,
            help="Optional: Only required for private datasets",
        )

        openai_key = entered_openai_key or secret_openai_key
        hf_token = entered_hf_token or secret_hf_token

        if not openai_key:
            st.info("👆 Please enter your OpenAI API key to get started.")
            return

        try:
            # Only attempt HF login if token is provided
            if hf_token:
                login(hf_token)
                st.success("✅ HuggingFace authentication successful!")
            st.session_state.openai_key = openai_key
            st.success("✅ OpenAI API key set!")
        except Exception as e:
            st.error(f"❌ Authentication error: {str(e)}")
            return

    # Main content area (only reached with a usable OpenAI key)
    dataset_name = st.text_input(
        "Enter HuggingFace Dataset Name",
        placeholder="username/dataset",
        help="Enter the full path to your HuggingFace dataset (e.g., 'username/dataset')",
    )
    if dataset_name and st.button("Load Dataset", type="primary"):
        load_and_analyze_dataset(dataset_name)

    if st.session_state.df is not None:
        display_dataset_analysis()


if __name__ == "__main__":
    main()

src/utils/__init__.py ADDED Viewed

File without changes

src/utils/analysis.py ADDED Viewed

	@@ -0,0 +1,486 @@

+from openai import OpenAI
+import json
+import yaml
+import re
+import datetime
+import plotly.express as px
+import plotly.graph_objects as go
+import pandas as pd
+import base64
+import io
+from collections import Counter
+import tiktoken
def extract_json_from_response(text: str) -> str:
    """Extract the JSON object from a model response.

    Candidates are tried in this order:

    1. an object inside a Markdown code fence (optionally tagged ``json``);
    2. the first substring starting at a ``{`` that parses as valid JSON,
       which copes with prose containing braces and with multiple objects;
    3. the span from the first ``{`` to the last ``}``;
    4. the original text, when it contains no braces at all.

    Args:
        text: Raw response text.

    Returns:
        The extracted JSON string, or ``text`` unchanged if nothing was found.
    """
    # Try to find JSON within code blocks first
    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
    if fenced:
        return fenced.group(1)

    # Scan for the first brace that starts a syntactically valid JSON value.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            _, end = decoder.raw_decode(text, start)
        except ValueError:
            start = text.find("{", start + 1)
        else:
            return text[start:end]

    # Nothing parsed: fall back to the widest brace-delimited span so the
    # caller still sees (and can report) what the model produced.
    loose = re.search(r"\{.*\}", text, re.DOTALL)
    if loose:
        return loose.group(0)

    # If no JSON found, return the original text
    return text
def count_tokens(text: str, model: str = "gpt-4") -> int:
    """Count the tokens in ``text`` using tiktoken.

    Args:
        text: Value to tokenize; it is converted with ``str()`` first.
        model: Model name used to pick the encoding. Names tiktoken cannot
            map fall back to the ``cl100k_base`` encoding.

    Returns:
        The number of tokens, or ``0`` if tokenization fails.
    """
    try:
        try:
            encoder = tiktoken.encoding_for_model(model)
        except KeyError:
            # Unknown model name: use a general-purpose encoding rather than
            # reporting zero tokens for every value.
            encoder = tiktoken.get_encoding("cl100k_base")
        # Dataset text may literally contain special-token strings such as
        # "<|endoftext|>"; encode them as ordinary text instead of raising.
        return len(encoder.encode(str(text), disallowed_special=()))
    except Exception as e:
        print(f"Error counting tokens: {e}")
        return 0
def create_distribution_plot(data, column):
    """Render the value distribution of ``column`` as a base64-encoded PNG.

    List columns are plotted as a bar chart of the counts returned by
    ``flatten_list_column``, ``int64``/``float64`` columns as a 30-bin
    histogram, and everything else as a bar chart of ``value_counts()``.

    Args:
        data: DataFrame containing ``column``.
        column: Name of the column to plot.

    Returns:
        The PNG image as a base64 string.

    Raises:
        Exception: Any failure is printed and re-raised.
    """
    numeric_dtypes = ["int64", "float64"]

    def bar_figure(counts):
        # Bars are coloured by their own height on the Plotly3 scale.
        bars = go.Bar(
            x=counts.index,
            y=counts.values,
            marker=dict(
                color=counts.values,
                colorscale=px.colors.sequential.Plotly3,
            ),
        )
        return go.Figure([bars])

    try:
        series = data[column]
        holds_lists = isinstance(series.iloc[0], list)

        if holds_lists:
            print(f"Processing list column: {column}")
            fig = bar_figure(flatten_list_column(data, column))
        elif series.dtype in numeric_dtypes:
            # Continuous data - use histogram
            histogram = go.Histogram(
                x=series,
                name="Count",
                nbinsx=30,
                marker=dict(
                    color="rgba(110, 68, 255, 0.7)",
                    line=dict(color="rgba(184, 146, 255, 1)", width=1),
                ),
            )
            fig = go.Figure()
            fig.add_trace(histogram)
        else:
            # Categorical data
            fig = bar_figure(series.value_counts())

        # Layout shared by every chart type.
        layout = dict(
            title=dict(text=f"Distribution of {column}", x=0.5, y=0.95),
            xaxis_title=column,
            yaxis_title="Count",
            template="plotly_white",
            margin=dict(t=50, l=50, r=30, b=50),
            width=600,
            height=400,
            showlegend=False,
            plot_bgcolor="rgba(0,0,0,0)",
            paper_bgcolor="rgba(0,0,0,0)",
        )
        fig.update_layout(**layout)

        # Category labels can be long, so tilt them on the bar charts.
        if holds_lists or series.dtype not in numeric_dtypes:
            fig.update_layout(xaxis_tickangle=-45)

        # Faint grid lines on both axes.
        grid_style = dict(gridcolor="rgba(128,128,128,0.1)", gridwidth=1)
        fig.update_yaxes(**grid_style)
        fig.update_xaxes(**grid_style)

        # PNG at moderate resolution, then base64 text.
        png_bytes = fig.to_image(format="png", scale=1.5)
        return base64.b64encode(png_bytes).decode()
    except Exception as e:
        print(f"Error creating distribution plot for {column}: {str(e)}")
        raise e
def create_wordcloud(data, column):
    """Render a word cloud for ``column`` as a base64-encoded PNG.

    List columns contribute every item of every list; other columns
    contribute their non-missing values converted to strings, so missing
    values do not show up as the word "nan".

    Args:
        data: DataFrame containing ``column``.
        column: Name of the column to visualise.

    Returns:
        The PNG image as a base64 string.

    Raises:
        Exception: Any failure is printed and re-raised.
    """
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt

    try:
        series = data[column]
        if isinstance(series.iloc[0], list):
            # Handle list columns; rows that are not lists are skipped.
            text = " ".join(
                " ".join(map(str, items))
                for items in series
                if isinstance(items, list)
            )
        else:
            # Handle regular columns
            text = " ".join(series.dropna().astype(str))

        wordcloud = WordCloud(
            width=600,
            height=300,
            background_color="white",
            colormap="plasma",
            max_words=100,
        ).generate(text)

        # Draw on an explicit figure and always close it, so a failure while
        # rendering or saving cannot leak the figure or disturb whatever
        # pyplot considers the "current" figure.
        fig, ax = plt.subplots(figsize=(8, 4))
        try:
            ax.imshow(wordcloud, interpolation="bilinear")
            ax.axis("off")
            ax.set_title(f"Word Cloud for {column}")
            buf = io.BytesIO()
            fig.savefig(buf, format="png", bbox_inches="tight", dpi=150)
        finally:
            plt.close(fig)

        # Convert to base64
        return base64.b64encode(buf.getvalue()).decode()
    except Exception as e:
        print(f"Error creating word cloud for {column}: {str(e)}")
        raise e
def analyze_dataset_with_openai(client: OpenAI, dataset_sample) -> dict:
    """Ask the OpenAI chat API to describe a dataset sample.

    Args:
        client: OpenAI client used for the chat completion request.
        dataset_sample: Either a list of records (the first one is used for
            schema inference) or a single object used as-is. Values that are
            not JSON-serialisable are converted with ``str()`` for the prompt.

    Returns:
        The parsed JSON response, expected to contain ``description``,
        ``schema`` and ``example``. If serialisation, the request or parsing
        fails, a placeholder dict with the same keys is returned instead.
    """
    failure_result = {
        "description": {
            "overview": "Error analyzing dataset",
            "key_features": ["Error: Failed to analyze dataset"],
            "ml_applications": "Analysis unavailable",
        },
        "schema": {},
        "example": {},
    }

    # Get a single record for schema inference; an empty list has none.
    if isinstance(dataset_sample, list):
        single_record = dataset_sample[0] if dataset_sample else {}
    else:
        single_record = dataset_sample

    try:
        # default=str keeps dates, bytes and similar values from aborting the
        # analysis; they are only being shown to the model as text.
        sample_json = json.dumps(dataset_sample, indent=2, default=str)
        single_record_json = json.dumps(single_record, indent=2, default=str)
    except (TypeError, ValueError) as e:
        print(f"Could not serialize dataset sample: {str(e)}")
        return failure_result

    prompt = f"""Analyze this dataset sample and provide the following in a JSON response:
    1. A concise description that includes:
       - A one-sentence overview of what the dataset contains
       - A bullet-pointed list of key features and statistics
       - A brief statement about potential ML/AI applications
    2. A schema showing each field's type and description. Use this single record for type inference:
    {single_record_json}
    For schema types, use precise types like:
    - "string" for text fields
    - "number" for numeric fields
    - "boolean" for true/false
    - "array of X" for arrays where X is the type of elements
    - "object" for nested objects, with nested field descriptions
    3. A formatted example record
    Format your response as JSON with these exact keys:
    {{
        "description": {{
            "overview": "One clear sentence describing the dataset...",
            "key_features": [
                "Feature or statistic 1",
                "Feature or statistic 2"
            ],
            "ml_applications": "Brief statement about ML/AI use cases..."
        }},
        "schema": {{
            "field_name": {{
                "type": "precise type as described above",
                "description": "Description of what this field contains"
            }}
        }},
        "example": {{"key": "value"}}
    }}
    For context, here are more sample records to help with the overview and features:
    {sample_json}
    """
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_tokens=2000,
        )
        # The content can be None (for example on a refusal); treat that as
        # an empty answer so it fails in JSON parsing below.
        response_text = response.choices[0].message.content or ""
        print("OpenAI Response:", response_text)
        # Extract JSON from the response
        json_str = extract_json_from_response(response_text)
        print("Extracted JSON:", json_str)
        # Parse the JSON
        result = json.loads(json_str)
        print("Parsed Result:", result)
        return result
    except Exception as e:
        print(f"OpenAI API error: {str(e)}")
        return failure_result
def analyze_dataset_statistics(df):
    """Generate simplified dataset statistics with token counting.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        dict with two entries:
          * "basic_stats": "total_records" (row count), "total_features"
            (column count) and "memory_usage" (deep memory usage formatted
            as a string in MB).
          * "token_stats": "total" (sum over all counted columns) and
            "by_column" (column name -> token count).

    Only object-dtype columns (and columns whose first cell is a list) are
    tokenised. A column that fails to process is logged and skipped so one
    bad column does not discard the statistics of the others.
    """
    stats = {
        "basic_stats": {
            "total_records": len(df),
            "total_features": len(df.columns),
            "memory_usage": f"{df.memory_usage(deep=True).sum() / (1024*1024):.2f} MB"
        },
        "token_stats": {
            "total": 0,
            "by_column": {}
        }
    }

    def _cell_text(value):
        # Decide per cell rather than from the first row only: a list column
        # may contain None/NaN/scalar cells, which must not abort the column
        # (or be iterated character by character when they are strings).
        if isinstance(value, list):
            return ' '.join(str(item) for item in value)
        return str(value)

    # Count tokens for each column
    for column in df.columns:
        try:
            series = df[column]
            # Nothing to inspect or count; avoids IndexError from iloc[0].
            if series.empty:
                continue
            if series.dtype != 'object' and not isinstance(series.iloc[0], list):
                continue
            token_counts = series.apply(lambda value: count_tokens(_cell_text(value)))
            total_tokens = int(token_counts.sum())
            stats["token_stats"]["total"] += total_tokens
            stats["token_stats"]["by_column"][column] = total_tokens
        except Exception as e:
            # Best effort: report the column and keep going.
            print(f"Error processing column {column}: {str(e)}")
            continue
    return stats
def format_dataset_stats(stats):
    """Render the statistics dictionary as a markdown section.

    Args:
        stats: dict with a "basic_stats" mapping (total_records,
            total_features, memory_usage) and a "token_stats" mapping
            (total, by_column).

    Returns:
        Markdown text. The token section is emitted only when the total
        token count is positive; the per-column list only when non-empty.
    """
    basic = stats["basic_stats"]
    pieces = [
        "## Dataset Overview\n",
        "### Basic Statistics\n",
        f"* Total Records: {basic['total_records']:,}\n",
        f"* Total Features: {basic['total_features']}\n",
        f"* Memory Usage: {basic['memory_usage']}\n",
    ]

    token_stats = stats["token_stats"]
    if token_stats["total"] > 0:
        pieces.append("\n### Token Info\n")
        pieces.append(f"* Total Tokens: {token_stats['total']:,}\n")
        per_column = token_stats["by_column"]
        if per_column:
            pieces.append("\nTokens by Column:\n")
            pieces.extend(
                f"* {name}: {tokens:,}\n" for name, tokens in per_column.items()
            )
    return "".join(pieces)
def _render_image_grid(heading: str, caption: str, images: dict, grid_columns: int) -> str:
    """Return an HTML grid of inline base64 PNGs, or "" when there are none."""
    if not images:
        return ""
    pieces = [
        f"\n### {heading}\n\n",
        f'<div style="display: grid; grid-template-columns: repeat({grid_columns}, 1fr); gap: 20px;">\n',
    ]
    for col, img_str in images.items():
        pieces.append("<div>\n")
        pieces.append(f"<h4>{caption} {col}</h4>\n")
        pieces.append(
            f'<img src="data:image/png;base64,{img_str}" style="width: 100%; height: auto;">\n'
        )
        pieces.append("</div>\n")
    pieces.append("</div>\n\n")
    return "".join(pieces)


def generate_dataset_card(
    dataset_info: dict,
    distribution_plots: dict,
    wordcloud_plots: dict,
    openai_analysis: dict,
    df: pd.DataFrame,
) -> str:
    """Assemble the full dataset card (README) text.

    Args:
        dataset_info: must provide "dataset_name"; used for the title, the
            BibTeX entry and the dataset URL.
        distribution_plots: column name -> base64 PNG string, rendered in a
            one-column grid. May be empty.
        wordcloud_plots: column name -> base64 PNG string, rendered in a
            two-column grid. May be empty.
        openai_analysis: must provide "description" (with "overview",
            "key_features", "ml_applications"), "schema" and "example".
        df: DataFrame the statistics section is computed from.

    Returns:
        The card as a single string: YAML front matter followed by markdown.
    """
    # NOTE(review): the front-matter values are fixed literals; nothing here
    # derives them from df or dataset_info.
    front_matter = yaml.dump(
        {
            "language": ["en"],
            "license": "apache-2.0",
            "multilinguality": "monolingual",
            "size_categories": ["1K<n<10K"],
            "task_categories": ["other"],
        },
        sort_keys=False,
    )

    description = openai_analysis["description"]
    schema_table = generate_schema_table(openai_analysis["schema"])
    example_json = json.dumps(openai_analysis["example"], indent=2)
    example_block = f"```json\n{example_json}\n```"
    stats_section = format_dataset_stats(analyze_dataset_statistics(df))

    distribution_plots_md = _render_image_grid(
        "Distribution Plots", "Distribution of", distribution_plots, 1
    )
    wordcloud_plots_md = _render_image_grid(
        "Word Clouds", "Word Cloud for", wordcloud_plots, 2
    )

    dataset_name = dataset_info["dataset_name"]
    # "/" is replaced so the repository id can serve as the citation key.
    citation_key = dataset_name.replace("/", "_")
    overview = description["overview"]
    feature_bullets = "\n".join(f"* {feature}" for feature in description["key_features"])
    ml_applications = description["ml_applications"]
    current_year = datetime.datetime.now().year

    return f"""---
{front_matter}---
# {dataset_name}
{overview}
The dataset includes:
{feature_bullets}
{ml_applications}
## Dataset Schema
{schema_table}
## Example Record
{example_block}
## Data Distribution Analysis
The following visualizations show key characteristics of the dataset:
{distribution_plots_md}
{wordcloud_plots_md}
{stats_section}
## Citation and Usage
If you use this dataset in your research or applications, please cite it as:
```bibtex
@dataset{{{citation_key},
    title = {{{dataset_name}}},
    author = {{Dataset Authors}},
    year = {{{current_year}}},
    publisher = {{Hugging Face}},
    howpublished = {{Hugging Face Datasets}},
    url = {{https://huggingface.co/datasets/{dataset_name}}}
}}
```
### Usage Guidelines
This dataset is released under the Apache 2.0 License. When using this dataset:
* 📚 Cite the dataset using the BibTeX entry above
* 🤝 Consider contributing improvements or reporting issues
* 💡 Share derivative works with the community when possible
For questions or additional information, please visit the dataset repository on Hugging Face.
"""
def generate_schema_table(schema: dict) -> str:
    """Build the markdown schema table, flattening nested fields into rows.

    Args:
        schema: mapping of field name to field info; each entry is expanded
            into zero or more rows by format_schema_item.

    Returns:
        The table header followed by the rows joined with newlines (header
        only when the schema yields no rows).
    """
    header = "| Field | Type | Description |\n| --- | --- | --- |\n"
    body_rows = [
        row
        for field, info in schema.items()
        for row in format_schema_item(field, info)
    ]
    return header + "\n".join(body_rows)
def _escape_table_cell(value) -> str:
    """Return value as text that cannot break out of a markdown table cell."""
    # A raw newline ends the table row and a raw pipe starts a new cell.
    text = str(value).replace("\r\n", " ").replace("\n", " ")
    return text.replace("|", "\\|")


def format_schema_item(field_name: str, field_info: dict, prefix: str = "") -> list:
    """Recursively format one schema entry as markdown table rows.

    Args:
        field_name: name of the field at this nesting level.
        field_info: either a leaf mapping containing both "type" and
            "description", or a mapping of sub-field names to further
            entries. Any non-dict value produces no rows.
        prefix: dotted path of the enclosing fields, including the trailing
            dot ("" at the top level).

    Returns:
        List of "| name | type | description |" rows; nested fields are
        reported with dotted names such as "parent.child". Pipes and
        newlines inside a cell are neutralised so a value like
        "string | null" stays in its own column.
    """
    if not isinstance(field_info, dict):
        return []

    qualified_name = f"{prefix}{field_name}"
    if "type" in field_info and "description" in field_info:
        # Leaf node: emit a single row.
        return [
            "| {} | {} | {} |".format(
                _escape_table_cell(qualified_name),
                _escape_table_cell(field_info["type"]),
                _escape_table_cell(field_info["description"]),
            )
        ]

    # Nested object: recurse into every property under the extended path.
    rows = []
    child_prefix = f"{qualified_name}."
    for subfield, subinfo in field_info.items():
        rows.extend(format_schema_item(subfield, subinfo, child_prefix))
    return rows
def flatten_list_column(data, column):
    """Count how often each item occurs across the list cells of a column.

    Cells that are not lists are ignored.

    Returns:
        pandas Series indexed by item, holding the occurrence count, in
        order of first appearance.
    """
    tally = Counter()
    for cell in data[column]:
        if isinstance(cell, list):
            # Counter.update adds one per element of the cell.
            tally.update(cell)
    return pd.Series(tally)

src/utils/visualization.py ADDED Viewed

	@@ -0,0 +1,162 @@

+import plotly.express as px
+import plotly.graph_objects as go
+import pandas as pd
+import base64
+import io
+import plotly.express as px
+import plotly.graph_objects as go
+import pandas as pd
+import base64
+import io
+from collections import Counter
def flatten_list_column(data, column):
    """Tally the individual items found in the list cells of a column.

    Non-list cells are skipped. The result is a pandas Series mapping each
    distinct item to the number of times it appears.
    """
    list_cells = (cell for cell in data[column] if isinstance(cell, list))
    occurrences = Counter(item for cell in list_cells for item in cell)
    return pd.Series(occurrences)
def create_distribution_plot(data, column):
    """Create a distribution plot with Plotly and return it as a base64 PNG.

    Integer and float columns are drawn as a 30-bin histogram. List columns
    (detected from the first cell) and all other columns are drawn as a bar
    chart of value counts with rotated x-axis labels.

    Args:
        data: DataFrame holding the column.
        column: name of the column to plot.

    Returns:
        The PNG image encoded as a base64 string.

    Raises:
        Any exception raised while building or exporting the figure is
        logged and re-raised. This includes the IndexError from inspecting
        the first cell of an empty column.
    """
    try:
        series = data[column]
        is_list_column = isinstance(series.iloc[0], list)
        # Any integer/float width counts as continuous (int32, float32,
        # nullable Int64, ...), not only int64/float64. Booleans are neither
        # and stay categorical.
        is_continuous = not is_list_column and (
            pd.api.types.is_integer_dtype(series)
            or pd.api.types.is_float_dtype(series)
        )

        if is_continuous:
            fig = go.Figure()
            fig.add_trace(go.Histogram(
                x=series,
                name='Count',
                nbinsx=30,
                marker=dict(
                    color='rgba(110, 68, 255, 0.7)',
                    line=dict(color='rgba(184, 146, 255, 1)', width=1)
                )
            ))
        else:
            if is_list_column:
                print(f"Processing list column: {column}")
                value_counts = flatten_list_column(data, column)
            else:
                value_counts = series.value_counts()
            fig = go.Figure([go.Bar(
                x=value_counts.index,
                y=value_counts.values,
                marker=dict(
                    color=value_counts.values,
                    colorscale=px.colors.sequential.Plotly3,
                ),
            )])

        # Common layout updates
        fig.update_layout(
            title=f'Distribution of {column}',
            xaxis_title=column,
            yaxis_title='Count',
            template='plotly_white',
            margin=dict(t=50, l=50, r=50, b=50),
            width=1200,
            height=800,
            showlegend=False
        )
        # Category labels can be long; tilt them so they do not overlap.
        if not is_continuous:
            fig.update_layout(xaxis_tickangle=-45)

        img_bytes = fig.to_image(format="png", scale=2.0)
        return base64.b64encode(img_bytes).decode()
    except Exception as e:
        print(f"Error creating distribution plot for {column}: {str(e)}")
        # Bare raise keeps the original traceback intact.
        raise
def create_wordcloud(data, column, *, width=800, height=400):
    """Create a word cloud for a column and return it as a base64 PNG.

    This single definition replaces two same-named ones. The later one
    shadowed the earlier, so the list-column handling and error logging of
    the earlier one never ran. The defaults keep the 800x400 canvas of the
    definition that was actually in effect.

    Args:
        data: DataFrame holding the column.
        column: name of the column to visualise.
        width: word cloud canvas width in pixels.
        height: word cloud canvas height in pixels.

    Returns:
        The rendered PNG encoded as a base64 string.

    Raises:
        Any exception raised while generating or rendering the cloud is
        logged and re-raised.
    """
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt

    try:
        series = data[column]
        # The length guard keeps an empty column from failing on iloc[0].
        if len(series) > 0 and isinstance(series.iloc[0], list):
            # List column: use the items themselves, not the list repr.
            # Cells that are not lists are skipped.
            text = " ".join(
                " ".join(map(str, cell)) for cell in series if isinstance(cell, list)
            )
        else:
            text = " ".join(series.astype(str))

        wordcloud = WordCloud(
            width=width,
            height=height,
            background_color="white",
            colormap="plasma",
            max_words=100,
        ).generate(text)

        fig = plt.figure(figsize=(10, 5))
        try:
            plt.imshow(wordcloud, interpolation="bilinear")
            plt.axis("off")
            plt.title(f"Word Cloud for {column}")
            buf = io.BytesIO()
            plt.savefig(buf, format="png", bbox_inches="tight", dpi=300)
        finally:
            # Close this figure even when rendering fails, so figures do not
            # accumulate in pyplot's global registry.
            plt.close(fig)

        return base64.b64encode(buf.getvalue()).decode()
    except Exception as e:
        print(f"Error creating word cloud for {column}: {str(e)}")
        raise