{ "metadata": { "Name": "Gemma 2", "Provider": "Google", "URL": "https://ai.google.dev/gemma/docs/model_card_2", "Type": "Large Language Model", "Modalities": [ "Text-to-Text" ] }, "scores": { "1. Bias, Stereotypes, and Representational Harms Evaluation": { "1.1 Bias Detection Overview": { "status": "Yes", "sources": [ { "type": "🌐", "detail": "https://ai.google.dev/gemma/docs/model_card_2#data_preprocessing", "name": "Model Card - Data Preprocessing" }, { "type": "🌐", "detail": "https://developers.googleblog.com/en/gemma-explained-new-in-gemma-2/", "name": "Developer Blog" }, { "type": "🌐", "detail": "https://arxiv.org/html/2410.12864", "name": "Bias Analysis Paper" } ], "questions": { "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": true, "Have intrinsic properties of the AI system been evaluated for bias (e.g., embedding analysis)": true, "Have extrinsic bias evaluations been run (e.g., downstream task performance)": true, "Have evaluations been run across all applicable modalities": true, "Have bias evaluations been run that take the form of automatic quantitative evaluation": true, "Have bias evaluations been run with human participants?": true } }, "1.2 Protected Classes and Intersectional Measures": { "status": "Yes", "sources": [ { "type": "🌐", "detail": "https://ai.google.dev/gemma/docs/model_card_2#evaluation_results", "name": "Model Card - Evaluation Results" } ], "questions": { "Do evaluations cover all applicable legal protected categories for in-scope uses of the system?": true, "Do evaluations cover additional subgroups that are likely to be harmed based on other personal characteristics": false, "Evaluation of how different aspects of identity interact and compound in AI system behavior": false, "Evaluation of AI system biases for legal protected categories and additional relevant subgroups": false } }, "1.3 Measurement of Stereotypes and Harmful Associations": { "status": "Yes", "sources": [ { "type": "🌐", "detail": "https://arxiv.org/abs/2009.11462", "name": "Stereotype Analysis" } ], "questions": { "Measurement of known stereotypes in AI system outputs": true, "Measurement of other negative associations and assumptions regarding specific groups": true, "Measurement of stereotypes and negative associations across in-scope contexts": false } }, "1.4 Bias Evaluation Transparency and Documentation": { "status": "Yes", "sources": [ { "type": "🌐", "detail": "https://arxiv.org/pdf/2403.13793", "name": "Evaluation Documentation" } ], "questions": { "Sufficient documentation of evaluation method to understand the scope of the findings": false, "Sufficient documentation of evaluation methods to replicate findings": true, "Sufficient documentation of evaluation results to support comparison": true, "Documentation of bias mitigation measures": false, "Documentation of bias monitoring approaches": false } } }, "2. Cultural Values and Sensitive Content Evaluation": { "2.1 Cultural Variation Overview": { "status": "Yes", "sources": [ { "type": "🌐", "detail": "https://aclanthology.org/2024.findings-emnlp.942.pdf", "name": "Cultural Variation Analysis" } ], "questions": { "Evaluations at various stages": false, "Have intrinsic properties been evaluated for cultural variation": false, "Have extrinsic cultural variation evaluations been run": true, "Have evaluations been run across all applicable modalities": true, "Have cultural variation evaluations been run that take the form of automatic quantitative evaluation": true, "Have cultural variation evaluations been run with human participants?": false } }, "2.2 Cultural Diversity and Representation": { "status": "N/A", "sources": [], "questions": { "Use of evaluation methods developed in the cultural contexts in scope": false, "Respect of indigenous sovereignty, protected rights, and cultural norms": false, "Evaluation of cultural variation across geographic dimensions": false, "Evaluation of cultural variation representing communities' perspectives": false, "Analysis of how cultural context affects AI system performance": false } }, "2.3 Generated Sensitive Content across Cultural Contexts": { "status": "Yes", "sources": [ { "type": "🌐", "detail": "https://arxiv.org/html/2408.00118v1#S6", "name": "Content Safety Analysis" } ], "questions": { "Has the AI system been evaluated for its likelihood of facilitating generation of threatening or violent content": true, "Has the AI system been evaluated for its likelihood of facilitating generation of targeted harassment or discrimination": false, "Has the AI system been evaluated for its likelihood of facilitating generation of hate speech": false, "Has the AI system been evaluated for content embedding values not reflective of user cultural context": false, "Has the AI system been evaluated for exposing users to inappropriate content": false, "Has the AI system been evaluated for content with negative psychological impacts": true, "Has the evaluation explicitly addressed cultural variation": false } }, "2.4 Cultural Variation Transparency and Documentation": { "status": "N/A", "sources": [], "questions": { "Documentation of cultural contexts considered during development": false, "Documentation of cultural contexts covered by evaluations": false, "Sufficient documentation of evaluation method": false, "Sufficient documentation of evaluation methods to replicate findings": false, "Sufficient documentation of evaluation results": false, "Documentation of psychological impact on evaluators": false, "Documentation of evaluator well-being measures": false } } }, "3. Disparate Performance Evaluation": { "3.1 Disparate Performance Overview": { "status": "N/A", "sources": [], "questions": { "Have development choices been evaluated for disparate performance contribution": false, "Have extrinsic disparate performance evaluations been run": false, "Have evaluations been run across all applicable modalities": false, "Have disparate performance evaluations been run quantitatively": false, "Have disparate performance evaluations been run with human participants": false } }, "3.2 Identifying Target Groups": { "status": "N/A", "sources": [], "questions": { "Identification of mandated target groups": false, "Identification of additional potentially harmed groups": false, "Assessment of systemic barriers in data collection": false, "Consideration of historical disparities": false, "Identification of implicit and explicit markers": false } }, "3.3 Subgroup Performance Analysis": { "status": "N/A", "sources": [], "questions": { "Non-aggregated evaluation results across subpopulations": false, "Metrics for decision-making tasks": false, "Metrics for other tasks including generative": false, "Worst-case subgroup performance analysis": false, "Intersectional analysis": false, "Evaluation of implicit social group markers": false } }, "3.4 Transparency and Documentation": { "status": "N/A", "sources": [], "questions": { "Documentation of evaluation method scope": false, "Documentation of evaluation methods for replication": false, "Documentation of evaluation results for comparison": false, "Documentation of mitigation measures": false, "Documentation of monitoring approaches": false } } }, "4. Environmental Costs and Carbon Emissions Evaluation": { "4.1 Environmental Costs Overview": { "status": "N/A", "sources": [], "questions": { "Evaluations of different processes": false, "Evaluations across modalities": false, "Evaluations on standardized benchmarks": false, "Community feedback consideration": false, "Full supply chain consideration": false } }, "4.2 Development Impact": { "status": "N/A", "sources": [], "questions": { "FLOPS accounting": false, "Energy consumption evaluation": false, "Carbon impact evaluation": false, "Hardware lifecycle evaluation": false } }, "4.3 Deployment Impact": { "status": "Yes", "sources": [ { "type": "🌐", "detail": "https://cloud.google.com/blog/products/ai-machine-learning/performance-deepdive-of-gemma-on-google-cloud", "name": "Performance Analysis" } ], "questions": { "Evaluation of inference FLOPS": true, "Evaluation of common deployment energy consumption": false, "Evaluation across deployment settings": false, "Evaluation of task-specific variations": false, "Evaluation of deployment carbon impact": false, "Evaluation of deployment hardware lifecycle": false } }, "4.4 Documentation": { "status": "N/A", "sources": [], "questions": { "Equipment and infrastructure documentation": false, "Evaluation methods documentation": false, "Results documentation": false, "Documentation for comparison": false } } }, "5. Privacy and Data Protection Evaluation": { "5.1 Overview": { "status": "Yes", "sources": [ { "type": "🌐", "detail": "https://arxiv.org/pdf/2408.00118", "name": "Privacy Evaluation" } ], "questions": { "Evaluations at various stages": true, "Intrinsic privacy vulnerability evaluation": false, "Extrinsic privacy evaluations": true, "Evaluations across modalities": false, "Quantitative privacy evaluations": true, "Human participant privacy evaluations": false } }, "5.2 Privacy Harms": { "status": "Yes", "sources": [ { "type": "🌐", "detail": "https://arxiv.org/pdf/2408.00118", "name": "Privacy Analysis" } ], "questions": { "Personal information revelation evaluation": true, "Content impersonation evaluation": true, "Personal information confabulation evaluation": true } }, "5.3 IP and Security": { "status": "Yes", "sources": [ { "type": "🌐", "detail": "https://www.cio.com/article/3567106/latticeflow-launches-first-comprehensive-evaluation-framework-for-compliance-with-the-eu-ai-act.html", "name": "Security Evaluation" } ], "questions": { "Training data reproduction evaluation": true, "Information security risk evaluation": false } }, "5.4 Documentation": { "status": "Yes", "sources": [ { "type": "🌐", "detail": "https://ai.google.dev/gemma/docs/model_card_2", "name": "Model Card Documentation" } ], "questions": { "Evaluation methods documentation": false, "Results documentation": false, "Limitations documentation": true, "Deployment considerations documentation": false, "Training data documentation": false } } }, "6. Financial Costs Evaluation": { "6.1 Overview": { "status": "N/A", "sources": [], "questions": { "Cost evaluation across stages": false, "Component cost evaluation": false, "Modality cost evaluation": false, "Direct and indirect expense evaluation": false, "Cost projection validation": false } }, "6.2 Development Costs": { "status": "N/A", "sources": [], "questions": { "R&D labor costs": false, "Data collection costs": false, "Infrastructure costs": false, "Training approach costs": false, "Architecture impact costs": false } }, "6.3 Operation Costs": { "status": "N/A", "sources": [], "questions": { "Inference costs": false, "Storage costs": false, "Scaling costs": false, "Deployment context costs": false, "Update costs": false } }, "6.4 Documentation": { "status": "N/A", "sources": [], "questions": { "Methodology documentation": false, "Cost breakdown documentation": false, "Usage scenario documentation": false, "Projection documentation": false } } }, "7. Data and Content Moderation Labor Evaluation": { "7.1 Overview": { "status": "N/A", "sources": [], "questions": { "Labor practice evaluation": false, "Worker category evaluation": false, "Task type evaluation": false, "Industry standard evaluation": false, "Worker type evaluation": false, "Regional context evaluation": false } }, "7.2 Working Conditions": { "status": "N/A", "sources": [], "questions": { "Compensation assessment": false, "Job security assessment": false, "Workplace safety evaluation": false, "Worker autonomy assessment": false, "Power dynamics evaluation": false } }, "7.3 Worker Wellbeing": { "status": "N/A", "sources": [], "questions": { "Support system assessment": false, "Content preparation evaluation": false, "Cultural support evaluation": false } }, "7.4 Documentation": { "status": "N/A", "sources": [], "questions": { "Methodology documentation": false, "Demographics documentation": false, "Support system documentation": false, "Incident reporting documentation": false } } } } }