Spaces:
Running
Running
{ | |
"metadata": { | |
"Name": "Gemma 2", | |
"Provider": "Google", | |
"URL": "https://ai.google.dev/gemma/docs/model_card_2", | |
"Type": "Large Language Model", | |
"Modalities": [ | |
"Text-to-Text" | |
] | |
}, | |
"scores": { | |
"1. Bias, Stereotypes, and Representational Harms Evaluation": { | |
"1.1 Bias Detection Overview": { | |
"status": "Yes", | |
"sources": [ | |
{ | |
"type": "π", | |
"detail": "https://ai.google.dev/gemma/docs/model_card_2#data_preprocessing", | |
"name": "Model Card - Data Preprocessing" | |
}, | |
{ | |
"type": "π", | |
"detail": "https://developers.googleblog.com/en/gemma-explained-new-in-gemma-2/", | |
"name": "Developer Blog" | |
}, | |
{ | |
"type": "π", | |
"detail": "https://arxiv.org/html/2410.12864", | |
"name": "Bias Analysis Paper" | |
} | |
], | |
"questions": { | |
"Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)": true, | |
"Have intrinsic properties of the AI system been evaluated for bias (e.g., embedding analysis)": true, | |
"Have extrinsic bias evaluations been run (e.g., downstream task performance)": true, | |
"Have evaluations been run across all applicable modalities": true, | |
"Have bias evaluations been run that take the form of automatic quantitative evaluation": true, | |
"Have bias evaluations been run with human participants?": true | |
} | |
}, | |
"1.2 Protected Classes and Intersectional Measures": { | |
"status": "Yes", | |
"sources": [ | |
{ | |
"type": "π", | |
"detail": "https://ai.google.dev/gemma/docs/model_card_2#evaluation_results", | |
"name": "Model Card - Evaluation Results" | |
} | |
], | |
"questions": { | |
"Do evaluations cover all applicable legal protected categories for in-scope uses of the system?": true, | |
"Do evaluations cover additional subgroups that are likely to be harmed based on other personal characteristics": false, | |
"Evaluation of how different aspects of identity interact and compound in AI system behavior": false, | |
"Evaluation of AI system biases for legal protected categories and additional relevant subgroups": false | |
} | |
}, | |
"1.3 Measurement of Stereotypes and Harmful Associations": { | |
"status": "Yes", | |
"sources": [ | |
{ | |
"type": "π", | |
"detail": "https://arxiv.org/abs/2009.11462", | |
"name": "Stereotype Analysis" | |
} | |
], | |
"questions": { | |
"Measurement of known stereotypes in AI system outputs": true, | |
"Measurement of other negative associations and assumptions regarding specific groups": true, | |
"Measurement of stereotypes and negative associations across in-scope contexts": false | |
} | |
}, | |
"1.4 Bias Evaluation Transparency and Documentation": { | |
"status": "Yes", | |
"sources": [ | |
{ | |
"type": "π", | |
"detail": "https://arxiv.org/pdf/2403.13793", | |
"name": "Evaluation Documentation" | |
} | |
], | |
"questions": { | |
"Sufficient documentation of evaluation method to understand the scope of the findings": false, | |
"Sufficient documentation of evaluation methods to replicate findings": true, | |
"Sufficient documentation of evaluation results to support comparison": true, | |
"Documentation of bias mitigation measures": false, | |
"Documentation of bias monitoring approaches": false | |
} | |
} | |
}, | |
"2. Cultural Values and Sensitive Content Evaluation": { | |
"2.1 Cultural Variation Overview": { | |
"status": "Yes", | |
"sources": [ | |
{ | |
"type": "π", | |
"detail": "https://aclanthology.org/2024.findings-emnlp.942.pdf", | |
"name": "Cultural Variation Analysis" | |
} | |
], | |
"questions": { | |
"Evaluations at various stages": false, | |
"Have intrinsic properties been evaluated for cultural variation": false, | |
"Have extrinsic cultural variation evaluations been run": true, | |
"Have evaluations been run across all applicable modalities": true, | |
"Have cultural variation evaluations been run that take the form of automatic quantitative evaluation": true, | |
"Have cultural variation evaluations been run with human participants?": false | |
} | |
}, | |
"2.2 Cultural Diversity and Representation": { | |
"status": "N/A", | |
"sources": [], | |
"questions": { | |
"Use of evaluation methods developed in the cultural contexts in scope": false, | |
"Respect of indigenous sovereignty, protected rights, and cultural norms": false, | |
"Evaluation of cultural variation across geographic dimensions": false, | |
"Evaluation of cultural variation representing communities' perspectives": false, | |
"Analysis of how cultural context affects AI system performance": false | |
} | |
}, | |
"2.3 Generated Sensitive Content across Cultural Contexts": { | |
"status": "Yes", | |
"sources": [ | |
{ | |
"type": "π", | |
"detail": "https://arxiv.org/html/2408.00118v1#S6", | |
"name": "Content Safety Analysis" | |
} | |
], | |
"questions": { | |
"Has the AI system been evaluated for its likelihood of facilitating generation of threatening or violent content": true, | |
"Has the AI system been evaluated for its likelihood of facilitating generation of targeted harassment or discrimination": false, | |
"Has the AI system been evaluated for its likelihood of facilitating generation of hate speech": false, | |
"Has the AI system been evaluated for content embedding values not reflective of user cultural context": false, | |
"Has the AI system been evaluated for exposing users to inappropriate content": false, | |
"Has the AI system been evaluated for content with negative psychological impacts": true, | |
"Has the evaluation explicitly addressed cultural variation": false | |
} | |
}, | |
"2.4 Cultural Variation Transparency and Documentation": { | |
"status": "N/A", | |
"sources": [], | |
"questions": { | |
"Documentation of cultural contexts considered during development": false, | |
"Documentation of cultural contexts covered by evaluations": false, | |
"Sufficient documentation of evaluation method": false, | |
"Sufficient documentation of evaluation methods to replicate findings": false, | |
"Sufficient documentation of evaluation results": false, | |
"Documentation of psychological impact on evaluators": false, | |
"Documentation of evaluator well-being measures": false | |
} | |
} | |
}, | |
"3. Disparate Performance Evaluation": { | |
"3.1 Disparate Performance Overview": { | |
"status": "N/A", | |
"sources": [], | |
"questions": { | |
"Have development choices been evaluated for disparate performance contribution": false, | |
"Have extrinsic disparate performance evaluations been run": false, | |
"Have evaluations been run across all applicable modalities": false, | |
"Have disparate performance evaluations been run quantitatively": false, | |
"Have disparate performance evaluations been run with human participants": false | |
} | |
}, | |
"3.2 Identifying Target Groups": { | |
"status": "N/A", | |
"sources": [], | |
"questions": { | |
"Identification of mandated target groups": false, | |
"Identification of additional potentially harmed groups": false, | |
"Assessment of systemic barriers in data collection": false, | |
"Consideration of historical disparities": false, | |
"Identification of implicit and explicit markers": false | |
} | |
}, | |
"3.3 Subgroup Performance Analysis": { | |
"status": "N/A", | |
"sources": [], | |
"questions": { | |
"Non-aggregated evaluation results across subpopulations": false, | |
"Metrics for decision-making tasks": false, | |
"Metrics for other tasks including generative": false, | |
"Worst-case subgroup performance analysis": false, | |
"Intersectional analysis": false, | |
"Evaluation of implicit social group markers": false | |
} | |
}, | |
"3.4 Transparency and Documentation": { | |
"status": "N/A", | |
"sources": [], | |
"questions": { | |
"Documentation of evaluation method scope": false, | |
"Documentation of evaluation methods for replication": false, | |
"Documentation of evaluation results for comparison": false, | |
"Documentation of mitigation measures": false, | |
"Documentation of monitoring approaches": false | |
} | |
} | |
}, | |
"4. Environmental Costs and Carbon Emissions Evaluation": { | |
"4.1 Environmental Costs Overview": { | |
"status": "N/A", | |
"sources": [], | |
"questions": { | |
"Evaluations of different processes": false, | |
"Evaluations across modalities": false, | |
"Evaluations on standardized benchmarks": false, | |
"Community feedback consideration": false, | |
"Full supply chain consideration": false | |
} | |
}, | |
"4.2 Development Impact": { | |
"status": "N/A", | |
"sources": [], | |
"questions": { | |
"FLOPS accounting": false, | |
"Energy consumption evaluation": false, | |
"Carbon impact evaluation": false, | |
"Hardware lifecycle evaluation": false | |
} | |
}, | |
"4.3 Deployment Impact": { | |
"status": "Yes", | |
"sources": [ | |
{ | |
"type": "π", | |
"detail": "https://cloud.google.com/blog/products/ai-machine-learning/performance-deepdive-of-gemma-on-google-cloud", | |
"name": "Performance Analysis" | |
} | |
], | |
"questions": { | |
"Evaluation of inference FLOPS": true, | |
"Evaluation of common deployment energy consumption": false, | |
"Evaluation across deployment settings": false, | |
"Evaluation of task-specific variations": false, | |
"Evaluation of deployment carbon impact": false, | |
"Evaluation of deployment hardware lifecycle": false | |
} | |
}, | |
"4.4 Documentation": { | |
"status": "N/A", | |
"sources": [], | |
"questions": { | |
"Equipment and infrastructure documentation": false, | |
"Evaluation methods documentation": false, | |
"Results documentation": false, | |
"Documentation for comparison": false | |
} | |
} | |
}, | |
"5. Privacy and Data Protection Evaluation": { | |
"5.1 Overview": { | |
"status": "Yes", | |
"sources": [ | |
{ | |
"type": "π", | |
"detail": "https://arxiv.org/pdf/2408.00118", | |
"name": "Privacy Evaluation" | |
} | |
], | |
"questions": { | |
"Evaluations at various stages": true, | |
"Intrinsic privacy vulnerability evaluation": false, | |
"Extrinsic privacy evaluations": true, | |
"Evaluations across modalities": false, | |
"Quantitative privacy evaluations": true, | |
"Human participant privacy evaluations": false | |
} | |
}, | |
"5.2 Privacy Harms": { | |
"status": "Yes", | |
"sources": [ | |
{ | |
"type": "π", | |
"detail": "https://arxiv.org/pdf/2408.00118", | |
"name": "Privacy Analysis" | |
} | |
], | |
"questions": { | |
"Personal information revelation evaluation": true, | |
"Content impersonation evaluation": true, | |
"Personal information confabulation evaluation": true | |
} | |
}, | |
"5.3 IP and Security": { | |
"status": "Yes", | |
"sources": [ | |
{ | |
"type": "π", | |
"detail": "https://www.cio.com/article/3567106/latticeflow-launches-first-comprehensive-evaluation-framework-for-compliance-with-the-eu-ai-act.html", | |
"name": "Security Evaluation" | |
} | |
], | |
"questions": { | |
"Training data reproduction evaluation": true, | |
"Information security risk evaluation": false | |
} | |
}, | |
"5.4 Documentation": { | |
"status": "Yes", | |
"sources": [ | |
{ | |
"type": "π", | |
"detail": "https://ai.google.dev/gemma/docs/model_card_2", | |
"name": "Model Card Documentation" | |
} | |
], | |
"questions": { | |
"Evaluation methods documentation": false, | |
"Results documentation": false, | |
"Limitations documentation": true, | |
"Deployment considerations documentation": false, | |
"Training data documentation": false | |
} | |
} | |
}, | |
"6. Financial Costs Evaluation": { | |
"6.1 Overview": { | |
"status": "N/A", | |
"sources": [], | |
"questions": { | |
"Cost evaluation across stages": false, | |
"Component cost evaluation": false, | |
"Modality cost evaluation": false, | |
"Direct and indirect expense evaluation": false, | |
"Cost projection validation": false | |
} | |
}, | |
"6.2 Development Costs": { | |
"status": "N/A", | |
"sources": [], | |
"questions": { | |
"R&D labor costs": false, | |
"Data collection costs": false, | |
"Infrastructure costs": false, | |
"Training approach costs": false, | |
"Architecture impact costs": false | |
} | |
}, | |
"6.3 Operation Costs": { | |
"status": "N/A", | |
"sources": [], | |
"questions": { | |
"Inference costs": false, | |
"Storage costs": false, | |
"Scaling costs": false, | |
"Deployment context costs": false, | |
"Update costs": false | |
} | |
}, | |
"6.4 Documentation": { | |
"status": "N/A", | |
"sources": [], | |
"questions": { | |
"Methodology documentation": false, | |
"Cost breakdown documentation": false, | |
"Usage scenario documentation": false, | |
"Projection documentation": false | |
} | |
} | |
}, | |
"7. Data and Content Moderation Labor Evaluation": { | |
"7.1 Overview": { | |
"status": "N/A", | |
"sources": [], | |
"questions": { | |
"Labor practice evaluation": false, | |
"Worker category evaluation": false, | |
"Task type evaluation": false, | |
"Industry standard evaluation": false, | |
"Worker type evaluation": false, | |
"Regional context evaluation": false | |
} | |
}, | |
"7.2 Working Conditions": { | |
"status": "N/A", | |
"sources": [], | |
"questions": { | |
"Compensation assessment": false, | |
"Job security assessment": false, | |
"Workplace safety evaluation": false, | |
"Worker autonomy assessment": false, | |
"Power dynamics evaluation": false | |
} | |
}, | |
"7.3 Worker Wellbeing": { | |
"status": "N/A", | |
"sources": [], | |
"questions": { | |
"Support system assessment": false, | |
"Content preparation evaluation": false, | |
"Cultural support evaluation": false | |
} | |
}, | |
"7.4 Documentation": { | |
"status": "N/A", | |
"sources": [], | |
"questions": { | |
"Methodology documentation": false, | |
"Demographics documentation": false, | |
"Support system documentation": false, | |
"Incident reporting documentation": false | |
} | |
} | |
} | |
} | |
} | |