marcenacp committed on
Commit
cb5b71d
·
1 Parent(s): c3ac09f

Initial commit

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .streamlit/config.toml +9 -0
  2. Makefile +26 -0
  3. __init__.py +0 -0
  4. app.py +29 -0
  5. components/__init__.py +0 -0
  6. components/tree/__init__.py +36 -0
  7. components/tree/frontend/.env +6 -0
  8. components/tree/frontend/.prettierrc +5 -0
  9. components/tree/frontend/build/asset-manifest.json +10 -0
  10. components/tree/frontend/build/index.html +1 -0
  11. components/tree/frontend/build/static/js/main.5a572f5d.js +0 -0
  12. components/tree/frontend/build/static/js/main.5a572f5d.js.LICENSE.txt +63 -0
  13. components/tree/frontend/build/static/js/main.5a572f5d.js.map +0 -0
  14. components/tree/frontend/package-lock.json +0 -0
  15. components/tree/frontend/package.json +47 -0
  16. components/tree/frontend/public/index.html +27 -0
  17. components/tree/frontend/src/Tree.tsx +215 -0
  18. components/tree/frontend/src/index.tsx +10 -0
  19. components/tree/frontend/src/react-app-env.d.ts +1 -0
  20. components/tree/frontend/tsconfig.json +25 -0
  21. core/__init__.py +0 -0
  22. core/constants.py +10 -0
  23. core/data_types.py +19 -0
  24. core/data_types_test.py +15 -0
  25. core/files.py +154 -0
  26. core/files_test.py +27 -0
  27. core/names.py +8 -0
  28. core/names_test.py +10 -0
  29. core/past_projects.py +34 -0
  30. core/record_sets.py +38 -0
  31. core/state.py +261 -0
  32. cypress.config.js +7 -0
  33. cypress/downloads/croissant-Titanic.json +1 -0
  34. cypress/downloads/croissant.json +1 -0
  35. cypress/e2e/createManually.cy.js +35 -0
  36. cypress/e2e/displayErrors.cy.js +30 -0
  37. cypress/e2e/loadCroissant.cy.js +61 -0
  38. cypress/e2e/renameDistribution.cy.js +36 -0
  39. cypress/e2e/uploadCsv.cy.js +59 -0
  40. cypress/fixtures/base.csv +4 -0
  41. cypress/fixtures/coco.json +409 -0
  42. cypress/fixtures/titanic.json +343 -0
  43. cypress/screenshots/uploadCsv.cy.js/Editor loads a local CSV as a resource -- should display the form Overview, Metadata, Resources, & Record Sets (failed).png +0 -0
  44. cypress/support/e2e.js +6 -0
  45. cypress/support/resize_observer.js +11 -0
  46. events/__init__.py +0 -0
  47. events/fields.py +147 -0
  48. events/metadata.py +28 -0
  49. events/record_sets.py +29 -0
  50. events/resources.py +41 -0
.streamlit/config.toml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ [browser]
2
+ gatherUsageStats = false
3
+
4
+ [theme]
5
+ primaryColor = "#F29828"
6
+ backgroundColor = "#CCEBD4"
7
+ secondaryBackgroundColor = "#EEF2F9"
8
+ textColor = "#171D30"
9
+ font = "sans serif"
Makefile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ black:
2
+ black \
3
+ --line-length 88 \
4
+ --preview \
5
+ .
6
+
7
+ isort:
8
+ isort \
9
+ --profile google \
10
+ --line-length 88 \
11
+ --use-parentheses \
12
+ --project mlcroissant \
13
+ --project components \
14
+ --project core \
15
+ --project events \
16
+ --project views \
17
+ --project state \
18
+ --project utils \
19
+ --multi-line 3 \
20
+ --thirdparty datasets \
21
+ .
22
+
23
+ format: black isort
24
+
25
+ pytest:
26
+ PYTHONPATH=. pytest
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ from core.state import CurrentStep
4
+ from utils import init_state
5
+ from views.splash import render_splash
6
+ from views.wizard import render_editor
7
+
8
+ init_state()
9
+
10
+
11
def _back_to_menu():
    """Callback of the "Menu" button: re-initializes the state with `force=True`."""
    init_state(force=True)
14
+
15
+
16
st.set_page_config(page_title="Croissant Editor", page_icon="🥐", layout="wide")

# Header row: a wide column for the title, a narrow one for the "Menu" button.
col1, col2 = st.columns([10, 1])
col1.header("Croissant Editor")

# The current step is stored in the session state under the `CurrentStep` class.
step = st.session_state[CurrentStep]
on_splash = step == CurrentStep.splash

if not on_splash:
    col2.write("\n")  # Vertical box to shift the button menu
    col2.button("Menu", on_click=_back_to_menu)


# Render the view matching the current step.
if on_splash:
    render_splash()
elif step == CurrentStep.editor:
    render_editor()
else:
    st.warning("invalid unhandled app state")
components/__init__.py ADDED
File without changes
components/tree/__init__.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import streamlit.components.v1 as components
4
+
5
# `_RELEASE` selects where the frontend is loaded from:
#   - False: the development server (while developing the component);
#   - True: the static build shipped next to this file (packaged component).
_RELEASE = True

if _RELEASE:
    # Serve the pre-built frontend that lives in `frontend/build`.
    parent_dir = os.path.dirname(os.path.abspath(__file__))
    build_dir = os.path.join(parent_dir, "frontend/build")
    _component_func = components.declare_component("tree_component", path=build_dir)
else:
    # Point to the local dev server of the frontend.
    _component_func = components.declare_component(
        "tree_component",
        url="http://localhost:3001",
    )
18
+
19
+
20
def render_tree(nodes, key=None):
    """Create a new instance of "tree_component".

    Args:
        nodes: The nodes to render in the tree. Nodes are dictionaries with keys `name`
            (unique identifier), `type` and `parents` (list of the names of the
            parent nodes; an empty list for a root node).
        key: An optional key that uniquely identifies this component. If this is
            None, and the component's arguments are changed, the component will
            be re-mounted in the Streamlit frontend and lose its current state.

    Returns:
        The value the frontend passed to `Streamlit.setComponentValue`, i.e. the
        `name` of the node that was clicked. Until the frontend sets a value, the
        default `0` is returned.
    """
    component_value = _component_func(nodes=nodes, key=key, default=0)
    return component_value
components/tree/frontend/.env ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Run the component's dev server on :3001
2
+ # (The Streamlit dev server already runs on :3000)
3
+ PORT=3001
4
+
5
+ # Don't automatically open the web browser on `npm run start`.
6
+ BROWSER=none
components/tree/frontend/.prettierrc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "endOfLine": "lf",
3
+ "semi": false,
4
+ "trailingComma": "es5"
5
+ }
components/tree/frontend/build/asset-manifest.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": {
3
+ "main.js": "./static/js/main.5a572f5d.js",
4
+ "index.html": "./index.html",
5
+ "main.5a572f5d.js.map": "./static/js/main.5a572f5d.js.map"
6
+ },
7
+ "entrypoints": [
8
+ "static/js/main.5a572f5d.js"
9
+ ]
10
+ }
components/tree/frontend/build/index.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <!doctype html><html lang="en"><head><title>Streamlit Tree Component</title><meta charset="UTF-8"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Streamlit Tree Component"/><script defer="defer" src="./static/js/main.5a572f5d.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
components/tree/frontend/build/static/js/main.5a572f5d.js ADDED
The diff for this file is too large to render. See raw diff
 
components/tree/frontend/build/static/js/main.5a572f5d.js.LICENSE.txt ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ object-assign
3
+ (c) Sindre Sorhus
4
+ @license MIT
5
+ */
6
+
7
+ /**
8
+ * @license React
9
+ * react-dom.production.min.js
10
+ *
11
+ * Copyright (c) Facebook, Inc. and its affiliates.
12
+ *
13
+ * This source code is licensed under the MIT license found in the
14
+ * LICENSE file in the root directory of this source tree.
15
+ */
16
+
17
+ /**
18
+ * @license React
19
+ * react-jsx-runtime.production.min.js
20
+ *
21
+ * Copyright (c) Facebook, Inc. and its affiliates.
22
+ *
23
+ * This source code is licensed under the MIT license found in the
24
+ * LICENSE file in the root directory of this source tree.
25
+ */
26
+
27
+ /**
28
+ * @license React
29
+ * react.production.min.js
30
+ *
31
+ * Copyright (c) Facebook, Inc. and its affiliates.
32
+ *
33
+ * This source code is licensed under the MIT license found in the
34
+ * LICENSE file in the root directory of this source tree.
35
+ */
36
+
37
+ /**
38
+ * @license React
39
+ * scheduler.production.min.js
40
+ *
41
+ * Copyright (c) Facebook, Inc. and its affiliates.
42
+ *
43
+ * This source code is licensed under the MIT license found in the
44
+ * LICENSE file in the root directory of this source tree.
45
+ */
46
+
47
+ /** @license React v16.13.1
48
+ * react-is.production.min.js
49
+ *
50
+ * Copyright (c) Facebook, Inc. and its affiliates.
51
+ *
52
+ * This source code is licensed under the MIT license found in the
53
+ * LICENSE file in the root directory of this source tree.
54
+ */
55
+
56
+ /** @license React v16.14.0
57
+ * react.production.min.js
58
+ *
59
+ * Copyright (c) Facebook, Inc. and its affiliates.
60
+ *
61
+ * This source code is licensed under the MIT license found in the
62
+ * LICENSE file in the root directory of this source tree.
63
+ */
components/tree/frontend/build/static/js/main.5a572f5d.js.map ADDED
The diff for this file is too large to render. See raw diff
 
components/tree/frontend/package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
components/tree/frontend/package.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "tree_component",
3
+ "version": "0.1.0",
4
+ "private": true,
5
+ "dependencies": {
6
+ "@mui/icons-material": "^5.14.16",
7
+ "@mui/material": "^5.14.17",
8
+ "@mui/x-tree-view": "^6.17.0",
9
+ "react": "^18.2.0",
10
+ "react-dom": "^18.2.0",
11
+ "streamlit-component-lib": "^2.0.0"
12
+ },
13
+ "scripts": {
14
+ "start": "react-scripts start",
15
+ "build": "react-scripts build",
16
+ "test": "react-scripts test",
17
+ "eject": "react-scripts eject"
18
+ },
19
+ "eslintConfig": {
20
+ "extends": "react-app"
21
+ },
22
+ "browserslist": {
23
+ "production": [
24
+ ">0.2%",
25
+ "not dead",
26
+ "not op_mini all"
27
+ ],
28
+ "development": [
29
+ "last 1 chrome version",
30
+ "last 1 firefox version",
31
+ "last 1 safari version"
32
+ ]
33
+ },
34
+ "homepage": ".",
35
+ "devDependencies": {
36
+ "@types/node": "^20.9.0",
37
+ "@types/react": "^18.2.37",
38
+ "@types/react-dom": "^18.2.15",
39
+ "react-scripts": "^5.0.1",
40
+ "typescript": "^5.2.2"
41
+ },
42
+ "overrides": {
43
+ "react-scripts": {
44
+ "typescript": "^5"
45
+ }
46
+ }
47
+ }
components/tree/frontend/public/index.html ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
4
+ <head>
5
+ <title>Streamlit Tree Component</title>
6
+ <meta charset="UTF-8" />
7
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
8
+ <meta name="theme-color" content="#000000" />
9
+ <meta name="description" content="Streamlit Tree Component" />
10
+ </head>
11
+
12
+ <body>
13
+ <noscript>You need to enable JavaScript to run this app.</noscript>
14
+ <div id="root"></div>
15
+ <!--
16
+ This HTML file is a template.
17
+ If you open it directly in the browser, you will see an empty page.
18
+
19
+ You can add webfonts, meta tags, or analytics to this file.
20
+ The build step will place the bundled scripts into the <body> tag.
21
+
22
+ To begin the development, run `npm start` or `yarn start`.
23
+ To create a production bundle, use `npm run build` or `yarn build`.
24
+ -->
25
+ </body>
26
+
27
+ </html>
components/tree/frontend/src/Tree.tsx ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import {
2
+ Streamlit,
3
+ StreamlitComponentBase,
4
+ withStreamlitConnection,
5
+ } from "streamlit-component-lib"
6
+ import React, { ReactNode } from "react"
7
+ import { styled, useTheme } from "@mui/material/styles"
8
+ import Box from "@mui/material/Box"
9
+ import Typography from "@mui/material/Typography"
10
+ import FileCopyIcon from "@mui/icons-material/FileCopy"
11
+ import InsertDriveFileIcon from "@mui/icons-material/InsertDriveFile"
12
+ import ArrowDropDownIcon from "@mui/icons-material/ArrowDropDown"
13
+ import ArrowRightIcon from "@mui/icons-material/ArrowRight"
14
+ import { SvgIconProps } from "@mui/material/SvgIcon"
15
+ import { TreeView } from "@mui/x-tree-view/TreeView"
16
+ import {
17
+ TreeItem,
18
+ TreeItemProps,
19
+ treeItemClasses,
20
+ } from "@mui/x-tree-view/TreeItem"
21
+
22
// All code related to the MUI tree component is taken from https://mui.com/x/react-tree-view.

// Declares the custom CSS variables so that they can be set through `style` props.
declare module "react" {
  interface CSSProperties {
    "--tree-view-bg-color"?: string
    "--tree-view-color"?: string
  }
}
29
+
30
// Props of `StyledTreeItem`: the regular `TreeItem` props plus label/color options.
type StyledTreeItemProps = TreeItemProps & {
  // Icon component and texts displayed in the label of the item.
  labelIcon: React.ElementType<SvgIconProps>
  labelText: string
  labelInfo?: string
  // Colors exposed as `--tree-view-color` / `--tree-view-bg-color`.
  color?: string
  colorForDarkMode?: string
  bgColor?: string
  bgColorForDarkMode?: string
}
39
+
40
// `TreeItem` restyled with the theme; item colors can be overridden with the
// `--tree-view-color` and `--tree-view-bg-color` CSS variables.
const StyledTreeItemRoot = styled(TreeItem)(({ theme }) => {
  const contentSelector = `& .${treeItemClasses.content}`
  const labelSelector = `& .${treeItemClasses.label}`
  const groupSelector = `& .${treeItemClasses.group}`
  const secondaryText = theme.palette.text.secondary

  return {
    color: secondaryText,
    [contentSelector]: {
      color: secondaryText,
      borderTopRightRadius: theme.spacing(2),
      borderBottomRightRadius: theme.spacing(2),
      paddingRight: theme.spacing(1),
      fontWeight: theme.typography.fontWeightMedium,
      "&.Mui-expanded": {
        fontWeight: theme.typography.fontWeightRegular,
      },
      "&:hover": {
        backgroundColor: theme.palette.action.hover,
      },
      "&.Mui-focused, &.Mui-selected, &.Mui-selected.Mui-focused": {
        backgroundColor: `var(--tree-view-bg-color, ${theme.palette.action.selected})`,
        color: "var(--tree-view-color)",
      },
      [labelSelector]: {
        fontWeight: "inherit",
        color: "inherit",
      },
    },
    [groupSelector]: {
      marginLeft: 0,
      [contentSelector]: {
        paddingLeft: theme.spacing(2),
      },
    },
  }
}) as unknown as typeof TreeItem
70
+
71
// Tree item whose label is made of an icon, a text and an optional info caption.
const StyledTreeItem = React.forwardRef(function StyledTreeItem(
  props: StyledTreeItemProps,
  ref: React.Ref<HTMLLIElement>
) {
  const {
    labelIcon: LabelIcon,
    labelText,
    labelInfo,
    color,
    colorForDarkMode,
    bgColor,
    bgColorForDarkMode,
    ...other
  } = props
  const isDarkMode = useTheme().palette.mode === "dark"

  // Pick the light or dark variant of the colors and expose them as CSS variables.
  const styleProps = {
    "--tree-view-color": isDarkMode ? colorForDarkMode : color,
    "--tree-view-bg-color": isDarkMode ? bgColorForDarkMode : bgColor,
  }

  const label = (
    <Box sx={{ display: "flex", alignItems: "center", p: 0.5, pr: 0 }}>
      <Box component={LabelIcon} color="inherit" sx={{ mr: 1 }} />
      <Typography
        data-testid="tree-element"
        variant="body2"
        sx={{
          whiteSpace: "nowrap",
          overflow: "hidden",
          textOverflow: "ellipsis",
          fontWeight: "inherit",
          flexGrow: 1,
        }}
      >
        {labelText}
      </Typography>
      <Typography variant="caption" color="inherit">
        {labelInfo}
      </Typography>
    </Box>
  )

  // `other` is spread after `label`/`style` so that callers can still override them.
  return (
    <StyledTreeItemRoot label={label} style={styleProps} {...other} ref={ref} />
  )
})
130
+
131
// A node as received from Streamlit.
type Node = {
  // Identifier of the node (also used as its label).
  name: string
  type: string
  // Names of the parent nodes; empty for a root node.
  parents: string[]
}

// A node augmented with the names of its children.
type TreeNode = Node & {
  children: string[]
}

// Tree nodes indexed by their name.
type TreeNodes = { [key: string]: TreeNode }
142
+
143
// Recursively renders `treeNode` and its children. Clicking on an item sends the
// name of the node back to Streamlit.
const TreeNodeComponent = ({
  treeNode,
  treeNodes,
}: {
  treeNode: TreeNode
  treeNodes: TreeNodes
}) => {
  const { children } = treeNode
  // Resolve the names of the children, ignoring the names that are not in the tree.
  const childrenNodes = children
    .filter((child) => child in treeNodes)
    .map((child) => treeNodes[child])
  const labelIcon =
    treeNode.type === "FileObject" ? InsertDriveFileIcon : FileCopyIcon
  return (
    <StyledTreeItem
      onClick={() => Streamlit.setComponentValue(treeNode.name)}
      nodeId={treeNode.name}
      labelText={treeNode.name}
      labelIcon={labelIcon}
    >
      {childrenNodes.map((childNode) => (
        // Elements rendered from an array need a stable `key`.
        <TreeNodeComponent
          key={childNode.name}
          treeNode={childNode}
          treeNodes={treeNodes}
        />
      ))}
    </StyledTreeItem>
  )
}
169
+
170
// Builds the tree (name -> node with its children) from the flat list of nodes and
// renders it fully expanded, starting from the nodes without parents.
const TreeViewWithNodes = ({ nodes }: { nodes: Node[] }) => {
  const treeNodes: TreeNodes = {}
  nodes.forEach((node) => {
    treeNodes[node.name] = { ...node, children: [] }
  })
  nodes.forEach((node) => {
    node.parents.forEach((parent) => {
      // Own-property check: `parent in treeNodes` would also match inherited names
      // (e.g. "constructor") and crash on `.children` for such a parent.
      if (Object.prototype.hasOwnProperty.call(treeNodes, parent)) {
        treeNodes[parent].children.push(node.name)
      }
    })
  })
  const rootNodes = Object.values(treeNodes).filter(
    (treeNode) => treeNode.parents.length === 0
  )

  return (
    <TreeView
      defaultCollapseIcon={<ArrowDropDownIcon />}
      defaultExpandIcon={<ArrowRightIcon />}
      defaultEndIcon={<div style={{ width: 24 }} />}
      expanded={Object.keys(treeNodes)}
      sx={{
        flexGrow: 1,
        margin: -1,
        padding: 1,
        border: "1px solid rgba(23, 29, 48, 0.2)",
        borderRadius: "0.5rem",
      }}
    >
      {rootNodes.map((treeNode) => (
        // Elements rendered from an array need a stable `key`.
        <TreeNodeComponent
          key={treeNode.name}
          treeNode={treeNode}
          treeNodes={treeNodes}
        />
      ))}
    </TreeView>
  )
}
207
+
208
// Streamlit component: renders the `nodes` argument received from Python as a tree.
class Tree extends StreamlitComponentBase<{}> {
  public render = (): ReactNode => {
    const { nodes } = this.props.args
    return <TreeViewWithNodes nodes={nodes} />
  }
}

export default withStreamlitConnection(Tree)
components/tree/frontend/src/index.tsx ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import React from "react"
2
+ import ReactDOM from "react-dom"
3
+ import Tree from "./Tree"
4
+
5
// Mount the component into the `#root` element of `index.html`.
const rootElement = document.getElementById("root")

ReactDOM.render(
  <React.StrictMode>
    <Tree />
  </React.StrictMode>,
  rootElement
)
components/tree/frontend/src/react-app-env.d.ts ADDED
@@ -0,0 +1 @@
 
 
1
+ /// <reference types="react-scripts" />
components/tree/frontend/tsconfig.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "compilerOptions": {
3
+ "target": "es5",
4
+ "lib": [
5
+ "dom",
6
+ "dom.iterable",
7
+ "esnext"
8
+ ],
9
+ "allowJs": true,
10
+ "skipLibCheck": true,
11
+ "esModuleInterop": true,
12
+ "allowSyntheticDefaultImports": true,
13
+ "strict": true,
14
+ "forceConsistentCasingInFileNames": true,
15
+ "module": "esnext",
16
+ "moduleResolution": "node",
17
+ "resolveJsonModule": true,
18
+ "isolatedModules": true,
19
+ "noEmit": true,
20
+ "jsx": "react"
21
+ },
22
+ "include": [
23
+ "src"
24
+ ]
25
+ }
core/__init__.py ADDED
File without changes
core/constants.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from etils import epath
2
+
3
+ import mlcroissant as mlc
4
+
5
# Folder of the editor, nested in the mlcroissant cache.
EDITOR_CACHE: epath.Path = mlc.constants.CROISSANT_CACHE / "editor"
# Folder containing one sub-folder per past project.
PAST_PROJECTS_PATH: epath.Path = EDITOR_CACHE / "projects"
# `strftime` pattern (down to the microsecond) used to name the project folders.
PROJECT_FOLDER_PATTERN = "%Y%m%d%H%M%S%f"


# NOTE(review): presumably the height of the displayed dataframes; confirm at usage.
DF_HEIGHT = 150
core/data_types.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+
3
+ import numpy as np
4
+
5
+ import mlcroissant as mlc
6
+
7
+
8
def convert_dtype(dtype: Any):
    """Maps a NumPy/Pandas dtype to the corresponding Croissant data type.

    Args:
        dtype: The dtype to convert, e.g. `np.int64` or the dtype of a column.

    Returns:
        The matching `mlc.DataType`.

    Raises:
        NotImplementedError: If the dtype is none of int64, float64, bool, str or
            object.
    """
    if dtype == np.int64:
        return mlc.DataType.INTEGER
    if dtype == np.float64:
        return mlc.DataType.FLOAT
    if dtype == np.bool_:
        return mlc.DataType.BOOL
    # Strings are either proper NumPy strings or generic Python objects.
    is_text = dtype == np.str_ or dtype == object
    if is_text:
        return mlc.DataType.TEXT
    raise NotImplementedError(dtype)
core/data_types_test.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for data_types."""
2
+
3
+ import numpy as np
4
+ import pytest
5
+
6
+ from .data_types import convert_dtype
7
+
8
+
9
def test_convert_dtype():
    """Checks the supported conversions and the error on unsupported dtypes."""
    # The comparisons have to be asserted, otherwise their result is discarded.
    assert convert_dtype(np.int64) == "https://schema.org/Integer"
    assert convert_dtype(np.float64) == "https://schema.org/Float"
    assert convert_dtype(np.bool_) == "https://schema.org/Boolean"
    assert convert_dtype(np.str_) == "https://schema.org/Text"
    with pytest.raises(NotImplementedError):
        convert_dtype(np.float32)
core/files.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ import hashlib
3
+ import io
4
+ import tempfile
5
+
6
+ from etils import epath
7
+ import pandas as pd
8
+ import requests
9
+
10
+ from .names import find_unique_name
11
+ from .state import FileObject
12
+ from .state import FileSet
13
+
14
# Names of the two kinds of resources.
FILE_OBJECT = "File object"
FILE_SET = "File set"
# All the kinds of resources, file objects first.
RESOURCE_TYPES = [FILE_OBJECT, FILE_SET]
17
+
18
+
19
@dataclasses.dataclass
class FileType:
    """Description of a file format."""

    # Name of the format, e.g. "CSV".
    name: str
    # MIME type of the format, e.g. "text/csv".
    encoding_format: str
    # File extensions of the format, without the leading dot.
    extensions: list[str]
24
+
25
+
26
class FileTypes:
    """Namespace of the known file formats."""

    CSV = FileType(
        name="CSV",
        encoding_format="text/csv",
        extensions=["csv"],
    )
    EXCEL = FileType(
        name="Excel",
        encoding_format="application/vnd.ms-excel",
        extensions=["xls", "xlsx", "xlsm"],
    )
    JSON = FileType(
        name="JSON",
        encoding_format="application/json",
        extensions=["json"],
    )
    JSONL = FileType(
        name="JSON-Lines",
        encoding_format="application/jsonl+json",
        extensions=["jsonl"],
    )
    PARQUET = FileType(
        name="Parquet",
        encoding_format="application/vnd.apache.parquet",
        extensions=["parquet"],
    )
46
+
47
+
48
# The known file formats indexed by their name.
FILE_TYPES: dict[str, FileType] = {
    known_type.name: known_type
    for known_type in (
        FileTypes.CSV,
        FileTypes.EXCEL,
        FileTypes.JSON,
        FileTypes.JSONL,
        FileTypes.PARQUET,
    )
}
58
+
59
+
60
+ def _sha256(content: bytes):
61
+ """Computes the sha256 digest of the byte string."""
62
+ return hashlib.sha256(content).hexdigest()
63
+
64
+
65
def hash_file_path(url: str) -> epath.Path:
    """Returns the local path for `url`: always the same path for the same URL.

    The path lives in the system temporary directory and is named after the sha256
    of the URL.
    """
    url_digest = _sha256(url.encode())
    return epath.Path(tempfile.gettempdir()) / f"croissant-editor-{url_digest}"
70
+
71
+
72
def download_file(url: str, file_path: epath.Path, timeout: float | None = 60.0):
    """Downloads the file locally to `file_path`.

    The content is first streamed to a temporary file and only copied to `file_path`
    once the download is complete, so a failed download does not leave a partial
    file at `file_path`.

    Args:
        url: The URL to download.
        file_path: Where to store the downloaded file.
        timeout: Seconds to wait for the server (to connect, and between two reads)
            before giving up. Without a timeout, `requests` can wait forever on an
            unresponsive server. `None` disables the timeout.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within `timeout`.
    """
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with tempfile.TemporaryDirectory() as tmpdir:
            staged_file = epath.Path(tmpdir) / "file"
            with staged_file.open("wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
            staged_file.copy(file_path)
82
+
83
+
84
def get_dataframe(file_type: FileType, file: io.BytesIO | epath.Path) -> pd.DataFrame:
    """Reads `file` into a dataframe with the pandas reader matching `file_type`.

    Raises:
        NotImplementedError: If `file_type` is not one of the known `FileTypes`.
    """
    readers = (
        (FileTypes.CSV, pd.read_csv),
        (FileTypes.EXCEL, pd.read_excel),
        (FileTypes.JSON, pd.read_json),
        (FileTypes.JSONL, lambda source: pd.read_json(source, lines=True)),
        (FileTypes.PARQUET, pd.read_parquet),
    )
    for known_type, reader in readers:
        if file_type == known_type:
            return reader(file)
    raise NotImplementedError()
98
+
99
+
100
def file_from_url(file_type: FileType, url: str, names: set[str]) -> FileObject:
    """Builds a FileObject from a URL, downloading the file if needed.

    The file is only downloaded when its local copy (see `hash_file_path`) does not
    exist yet.

    Args:
        file_type: The format of the file.
        url: The URL of the file.
        names: The names that are already used.
    """
    local_copy = hash_file_path(url)
    if not local_copy.exists():
        download_file(url, local_copy)
    with local_copy.open("rb") as stream:
        digest = _sha256(stream.read())
    frame = get_dataframe(file_type, local_copy).infer_objects()
    # The file is named after the last segment of the URL.
    basename = url.split("/")[-1]
    return FileObject(
        name=find_unique_name(names, basename),
        description="",
        content_url=url,
        encoding_format=file_type.encoding_format,
        sha256=digest,
        df=frame,
    )
116
+
117
+
118
def file_from_upload(
    file_type: FileType, file: io.BytesIO, names: set[str]
) -> FileObject:
    """Builds a FileObject from an uploaded file.

    Args:
        file_type: The format of the file.
        file: The uploaded file; its `name` attribute is used to name the resource.
        names: The names that are already used.
    """
    digest = _sha256(file.getvalue())
    frame = get_dataframe(file_type, file).infer_objects()
    unique_name = find_unique_name(names, file.name)
    return FileObject(
        name=unique_name,
        description="",
        content_url=f"data/{file.name}",
        encoding_format=file_type.encoding_format,
        sha256=digest,
        df=frame,
    )
132
+
133
+
134
def file_from_form(
    file_type: FileType, type: str, name, description, sha256: str, names: set[str]
) -> FileObject | FileSet:
    """Builds a resource from the fields that were manually filled.

    Args:
        file_type: The format of the resource.
        type: The kind of resource to create: `FILE_OBJECT` or `FILE_SET`.
        name: The wanted name of the resource (made unique among `names`).
        description: The description of the resource.
        sha256: The sha256 of the file; only used for a FileObject.
        names: The names that are already used.

    Raises:
        ValueError: If `type` is neither `FILE_OBJECT` nor `FILE_SET`.
    """
    if type == FILE_SET:
        return FileSet(
            name=find_unique_name(names, name),
            description=description,
            encoding_format=file_type.encoding_format,
        )
    if type == FILE_OBJECT:
        # No content is attached to a manually created FileObject.
        return FileObject(
            name=find_unique_name(names, name),
            description=description,
            content_url="",
            encoding_format=file_type.encoding_format,
            sha256=sha256,
            df=None,
        )
    raise ValueError("type has to be one of FILE_OBJECT, FILE_SET")
core/files_test.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from etils import epath
2
+ import pandas as pd
3
+ import pytest
4
+
5
+ from .files import file_from_url
6
+ from .files import FileTypes
7
+
8
+
9
def test_check_file_csv():
    """Checks `file_from_url` on a CSV that is already present locally."""
    # Local import so that the fixture always matches the path `file_from_url` reads.
    from .files import hash_file_path

    url = "https://my.url"
    # Pre-create the local copy of the URL so that nothing is downloaded. The path is
    # computed rather than hard-coded: the temporary directory is not always `/tmp`.
    csv = hash_file_path(url)
    if csv.exists():
        csv.unlink()
    try:
        with csv.open("w") as f:
            f.write("column1,column2\n")
            f.write("a,1\n")
            f.write("b,2\n")
            f.write("c,3\n")
        file = file_from_url(FileTypes.CSV, url, set())
        pd.testing.assert_frame_equal(
            file.df, pd.DataFrame({"column1": ["a", "b", "c"], "column2": [1, 2, 3]})
        )
        # Fails with unknown encoding_format:
        with pytest.raises(NotImplementedError):
            file_from_url("unknown", url, set())
    finally:
        # Do not leave the fixture behind in the temporary directory.
        if csv.exists():
            csv.unlink()
core/names.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """Module to handle naming of RecordSets and distribution."""
2
+
3
+
4
def find_unique_name(names: set[str], name: str):
    """Returns `name`, suffixed with as many `_0` as needed to not be in `names`."""
    candidate = name
    while candidate in names:
        candidate = candidate + "_0"
    return candidate
core/names_test.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for `names` module."""
2
+
3
+ from .names import find_unique_name
4
+
5
+
6
def test_find_unique_name():
    """Checks that taken names get `_0` suffixes and free names are kept."""
    taken = {"first", "second", "first_0"}
    expected_by_name = {
        "first": "first_0_0",
        "second": "second_0",
        "third": "third",
    }
    for name, expected in expected_by_name.items():
        assert find_unique_name(taken, name) == expected
core/past_projects.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import pickle
3
+
4
+ from etils import epath
5
+ import streamlit as st
6
+
7
+ from core.constants import PAST_PROJECTS_PATH
8
+ from core.state import CurrentProject
9
+ from core.state import Metadata
10
+
11
+
12
def load_past_projects_paths() -> list[epath.Path]:
    """Lists the folders of the past projects, sorted in descending order.

    The folder of the past projects is created if it does not exist yet.
    """
    PAST_PROJECTS_PATH.mkdir(parents=True, exist_ok=True)
    project_paths = list(PAST_PROJECTS_PATH.iterdir())
    project_paths.sort(reverse=True)
    return project_paths
15
+
16
+
17
def _pickle_file(path: epath.Path) -> epath.Path:
    """Returns the path of the pickled metadata within the project folder `path`."""
    return path.joinpath(".metadata.pkl")
19
+
20
+
21
def save_current_project():
    """Pickles the metadata of the session to the folder of the current project.

    Saving is best-effort: if the metadata cannot be pickled, the error is logged
    and the previously saved file (if any) is left untouched.
    """
    metadata = st.session_state[Metadata]
    project = st.session_state[CurrentProject]
    # Serialize before opening the file: opening with "wb" truncates it, so a failure
    # while pickling would otherwise destroy the last successful save.
    try:
        payload = pickle.dumps(metadata)
    except (pickle.PicklingError, TypeError, AttributeError):
        # Unpicklable objects raise TypeError/AttributeError as well as PicklingError.
        logging.exception("Could not pickle metadata.")
        return
    project.path.mkdir(parents=True, exist_ok=True)
    with _pickle_file(project.path).open("wb") as file:
        file.write(payload)
30
+
31
+
32
def open_project(path: epath.Path) -> Metadata:
    """Loads the pickled metadata stored in the project folder `path`."""
    pickle_path = _pickle_file(path)
    # NOTE(review): unpickling can execute arbitrary code; this assumes the project
    # folders only contain files written by the editor itself. Confirm.
    with pickle_path.open("rb") as stream:
        metadata = pickle.load(stream)
    return metadata
core/record_sets.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from core.data_types import convert_dtype
2
+ from core.names import find_unique_name
3
+ from core.state import Field
4
+ from core.state import FileObject
5
+ from core.state import FileSet
6
+ from core.state import RecordSet
7
+ import mlcroissant as mlc
8
+
9
+
10
def infer_record_sets(file: FileObject | FileSet, names: set[str]) -> list[RecordSet]:
    """Infers one or several ml:RecordSets from a FileObject/FileSet.

    Args:
        file: The resource to infer the record sets from.
        names: The names that are already used.

    Returns:
        A single record set with one field per column of the dataframe of `file`, or
        an empty list when nothing can be inferred.
    """
    # Nothing can be inferred for FileSets (no support for the moment), nor when the
    # underlying `pd.DataFrame` could not be built.
    if isinstance(file, FileSet) or file.df is None:
        return []
    fields = [
        Field(
            name=column,
            data_types=[convert_dtype(dtype)],
            source=mlc.Source(
                uid=file.name,
                node_type="distribution",
                extract=mlc.Extract(column=column),
            ),
            references=mlc.Source(),
        )
        for column, dtype in file.df.dtypes.items()
    ]
    record_set = RecordSet(
        fields=fields,
        name=find_unique_name(names, file.name + "_record_set"),
        description="",
    )
    return [record_set]
core/state.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Streamlit session state.
2
+
3
+ In the future, this could be the serialization format between front and back.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import dataclasses
9
+ import datetime
10
+ from typing import Any
11
+
12
+ from etils import epath
13
+ import pandas as pd
14
+
15
+ from core.constants import PAST_PROJECTS_PATH
16
+ from core.constants import PROJECT_FOLDER_PATTERN
17
+ import mlcroissant as mlc
18
+
19
+
20
def create_class(mlc_class: type, instance: Any, **kwargs) -> Any:
    """Creates the dataclass `mlc_class` from the attributes of `instance`.

    Used in both directions: to build mlcroissant classes from editor
    instances and editor classes from mlcroissant instances.

    Args:
        mlc_class: The dataclass to instantiate.
        instance: The object to copy attributes from. Only the attributes that
            are also constructor fields of `mlc_class` are copied.
        **kwargs: Explicit constructor arguments. They take precedence over the
            attributes of `instance`.

    Returns:
        A new instance of `mlc_class`.
    """
    params: dict[str, Any] = {
        field.name: getattr(instance, field.name)
        for field in dataclasses.fields(mlc_class)
        # Fields declared with `init=False` are not constructor parameters:
        # forwarding them would raise a TypeError.
        if field.init and hasattr(instance, field.name) and field.name not in kwargs
    }
    return mlc_class(**params, **kwargs)
29
+
30
+
31
class CurrentStep:
    """Enumerates, as string constants, the steps the application can be in."""

    # Value of the `splash` step.
    splash = "splash"
    # Value of the `editor` step.
    editor = "editor"
36
+
37
+
38
@dataclasses.dataclass
class CurrentProject:
    """The selected project, identified by the folder it is stored in."""

    # Folder of the project.
    path: epath.Path

    @classmethod
    def create_new(cls) -> CurrentProject:
        """Creates a project whose folder is named after the current local time.

        The folder name is the current time formatted with
        `PROJECT_FOLDER_PATTERN`; the folder lives under `PAST_PROJECTS_PATH`.
        The folder itself is not created here.
        """
        folder_name = datetime.datetime.now().strftime(PROJECT_FOLDER_PATTERN)
        project_path = PAST_PROJECTS_PATH / folder_name
        return CurrentProject(path=project_path)
48
+
49
+
50
class SelectedResource:
    """Marker for the FileSet or FileObject selected on the `Resources` page.

    The class has no attributes of its own.
    NOTE(review): presumably used as a key of `st.session_state` -- confirm
    against the callers.
    """
54
+
55
+
56
@dataclasses.dataclass
class SelectedRecordSet:
    """The RecordSet currently selected on the `RecordSets` page."""

    # Index of the selected RecordSet (presumably in `Metadata.record_sets`).
    record_set_key: int
    # The selected RecordSet itself.
    record_set: RecordSet
62
+
63
+
64
@dataclasses.dataclass
class FileObject:
    """Editor counterpart of `mlc.FileObject`.

    All attributes are optional so that a FileObject can be filled in
    progressively in the editor.
    """

    name: str | None = None
    description: str | None = None
    # Names of the resources this file is contained in.
    contained_in: list[str] | None = dataclasses.field(default_factory=list)
    content_size: str | None = None
    content_url: str | None = None
    encoding_format: str | None = None
    sha256: str | None = None
    # Content of the file as a DataFrame; None when it could not be built.
    # This attribute only exists in the editor.
    df: pd.DataFrame | None = None
    rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
77
+
78
+
79
@dataclasses.dataclass
class FileSet:
    """Editor counterpart of `mlc.FileSet`."""

    # Names of the resources this FileSet is contained in.
    contained_in: list[str] = dataclasses.field(default_factory=list)
    description: str | None = None
    encoding_format: str | None = ""
    includes: str | None = ""
    name: str = ""
    rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
89
+
90
+
91
@dataclasses.dataclass
class Field:
    """Editor counterpart of `mlc.Field`."""

    name: str | None = None
    description: str | None = None
    # A single data type or a list of data types.
    data_types: str | list[str] | None = None
    # Where the data of the field comes from. `source.uid` names a
    # distribution or a `record_set/field`.
    source: mlc.Source | None = None
    rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
    # Another field this field refers to, designated by `references.uid`.
    references: mlc.Source | None = None
101
+
102
+
103
@dataclasses.dataclass
class RecordSet:
    """Editor counterpart of `mlc.RecordSet`."""

    name: str = ""
    data: Any = None
    description: str | None = None
    is_enumeration: bool | None = None
    # Name(s) of the field(s) used as the key of the RecordSet.
    key: str | list[str] | None = None
    # Fields of the RecordSet, as editor `Field`s.
    fields: list[Field] = dataclasses.field(default_factory=list)
    rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
114
+
115
+
116
@dataclasses.dataclass
class Metadata:
    """Main Croissant object of the editor (editor counterpart of `mlc.Metadata`).

    `to_canonical` and `from_canonical` convert to and from the mlcroissant
    version. Resources, RecordSets and fields designate each other by name
    (`contained_in`, `source.uid`, `references.uid`), so the `rename_*` methods
    propagate a renaming to all these references.
    """

    name: str = ""
    description: str | None = None
    citation: str | None = None
    license: str | None = ""
    url: str = ""
    distribution: list[FileObject | FileSet] = dataclasses.field(default_factory=list)
    record_sets: list[RecordSet] = dataclasses.field(default_factory=list)
    rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)

    def __bool__(self):
        """A Metadata is truthy only when both `name` and `url` are non-empty."""
        return self.name != "" and self.url != ""

    @staticmethod
    def _rename_first_node(uid: str, old_name: str, new_name: str) -> str:
        """Renames the leading node of `uid` if it is exactly `old_name`.

        `uid` is either `old_name` itself or a `/`-separated path starting with
        it. Matching whole nodes prevents renaming `passengers` from also
        rewriting `passengers_extra/...`.
        """
        if not old_name:
            return uid
        if uid == old_name:
            return new_name
        if uid.startswith(old_name + "/"):
            return new_name + uid[len(old_name) :]
        return uid

    @staticmethod
    def _rename_last_node(uid: str, old_name: str, new_name: str) -> str:
        """Renames the trailing node(s) of the path `uid` if they are `old_name`.

        Only uids containing a `/` (i.e. `record_set/field` paths) are
        considered. `old_name` may be a bare field name or a full path. Only
        the trailing occurrence is replaced, so `label_set/label` becomes
        `label_set/<new_name>`.
        """
        if not old_name or "/" not in uid:
            return uid
        if uid == old_name:
            return new_name
        if uid.endswith("/" + old_name):
            return uid[: len(uid) - len(old_name)] + new_name
        return uid

    def _rename_in_fields(self, rename, old_name: str, new_name: str) -> None:
        """Applies `rename` to the source/references uid of every field."""
        for record_set in self.record_sets:
            for field in record_set.fields:
                for node in (field.source, field.references):
                    if not (node and node.uid):
                        continue
                    new_uid = rename(node.uid, old_name, new_name)
                    if new_uid != node.uid:
                        node.uid = new_uid

    def rename_distribution(self, old_name: str, new_name: str):
        """Renames a resource by changing all the references to this resource."""
        # Update other resources:
        for resource in self.distribution:
            contained_in = resource.contained_in
            if contained_in and old_name in contained_in:
                resource.contained_in = [
                    new_name if name == old_name else name for name in contained_in
                ]
        # Updating source/references works just as with RecordSets.
        self.rename_record_set(old_name, new_name)

    def rename_record_set(self, old_name: str, new_name: str):
        """Renames a RecordSet by changing all the references to this RecordSet.

        A source/references uid is updated when it is `old_name` or a path
        whose first node is `old_name`.
        """
        self._rename_in_fields(self._rename_first_node, old_name, new_name)

    def rename_field(self, old_name: str, new_name: str):
        """Renames a field by changing all the references to this field.

        A source/references uid is updated when it is a `/`-separated path
        ending with the node `old_name`. When `old_name` is a bare field name,
        fields with that name are updated in every RecordSet, as no RecordSet
        is specified.
        """
        self._rename_in_fields(self._rename_last_node, old_name, new_name)

    def add_distribution(self, distribution: FileSet | FileObject) -> None:
        """Appends a resource to the distribution."""
        self.distribution.append(distribution)

    def remove_distribution(self, key: int) -> None:
        """Removes the resource at index `key`."""
        del self.distribution[key]

    def add_record_set(self, record_set: RecordSet) -> None:
        """Appends a RecordSet."""
        self.record_sets.append(record_set)

    def remove_record_set(self, key: int) -> None:
        """Removes the RecordSet at index `key`."""
        del self.record_sets[key]

    def _find_record_set(self, record_set_key: int) -> RecordSet:
        """Returns the RecordSet at index `record_set_key`.

        Raises:
            ValueError: if the index is past the end of the RecordSets.
        """
        if record_set_key >= len(self.record_sets):
            raise ValueError(f"Wrong index when finding a RecordSet: {record_set_key}")
        return self.record_sets[record_set_key]

    def add_field(self, record_set_key: int, field: Field) -> None:
        """Appends `field` to the RecordSet at index `record_set_key`."""
        record_set = self._find_record_set(record_set_key)
        record_set.fields.append(field)

    def remove_field(self, record_set_key: int, field_key: int) -> None:
        """Removes the field at index `field_key` from the given RecordSet.

        Raises:
            ValueError: if either index is past the end of its list.
        """
        record_set = self._find_record_set(record_set_key)
        if field_key >= len(record_set.fields):
            raise ValueError(f"Wrong index when removing field: {field_key}")
        del record_set.fields[field_key]

    def to_canonical(self) -> mlc.Metadata:
        """Converts the editor metadata to its mlcroissant version."""
        distribution = []
        for file in self.distribution:
            if isinstance(file, FileObject):
                distribution.append(create_class(mlc.FileObject, file))
            elif isinstance(file, FileSet):
                distribution.append(create_class(mlc.FileSet, file))
        record_sets = []
        for record_set in self.record_sets:
            fields = [create_class(mlc.Field, field) for field in record_set.fields]
            record_sets.append(create_class(mlc.RecordSet, record_set, fields=fields))
        return create_class(
            mlc.Metadata,
            self,
            distribution=distribution,
            record_sets=record_sets,
        )

    @classmethod
    def from_canonical(cls, canonical_metadata: mlc.Metadata) -> Metadata:
        """Builds the editor metadata from its mlcroissant version."""
        distribution = []
        for file in canonical_metadata.distribution:
            if isinstance(file, mlc.FileObject):
                distribution.append(create_class(FileObject, file))
            else:
                distribution.append(create_class(FileSet, file))
        record_sets = []
        for record_set in canonical_metadata.record_sets:
            fields = [create_class(Field, field) for field in record_set.fields]
            record_sets.append(create_class(RecordSet, record_set, fields=fields))
        return create_class(
            cls,
            canonical_metadata,
            distribution=distribution,
            record_sets=record_sets,
        )
cypress.config.js ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
// Cypress configuration for the end-to-end tests.
const { defineConfig } = require("cypress");

module.exports = defineConfig({
  // To access content within Streamlit iframes for custom components:
  chromeWebSecurity: false,
  // No e2e-specific options: Cypress defaults are used.
  e2e: {},
});
cypress/downloads/croissant-Titanic.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"@context": {"@language": "en", "@vocab": "https://schema.org/", "column": "ml:column", "data": {"@id": "ml:data", "@type": "@json"}, "dataType": {"@id": "ml:dataType", "@type": "@vocab"}, "extract": "ml:extract", "field": "ml:field", "fileProperty": "ml:fileProperty", "format": "ml:format", "includes": "ml:includes", "isEnumeration": "ml:isEnumeration", "jsonPath": "ml:jsonPath", "ml": "http://mlcommons.org/schema/", "parentField": "ml:parentField", "path": "ml:path", "recordSet": "ml:recordSet", "references": "ml:references", "regex": "ml:regex", "repeated": "ml:repeated", "replace": "ml:replace", "sc": "https://schema.org/", "separator": "ml:separator", "source": "ml:source", "subField": "ml:subField", "transform": "ml:transform", "wd": "https://www.wikidata.org/wiki/"}, "@type": "sc:Dataset", "name": "Titanic", "description": "The original Titanic dataset, describing the status of individual passengers on the Titanic.\n\n The titanic data does not contain information from the crew, but it does contain actual ages of half of the passengers. \n\n For more information about how this dataset was constructed: \nhttps://web.archive.org/web/20200802155940/http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3info.txt\n\nOther useful information (useful for prices description for example):\nhttp://campus.lakeforest.edu/frank/FILES/MLFfiles/Bio150/Titanic/TitanicMETA.pdf\n\n Also see the following article describing shortcomings of the dataset data:\nhttps://emma-stiefel.medium.com/plugging-holes-in-kaggles-titanic-dataset-an-introduction-to-combining-datasets-with-fuzzywuzzy-60a686699da7\n", "citation": "The principal source for data about Titanic passengers is the Encyclopedia Titanica (http://www.encyclopedia-titanica.org/). The datasets used here were begun by a variety of researchers. 
One of the original sources is Eaton & Haas (1994) Titanic: Triumph and Tragedy, Patrick Stephens Ltd, which includes a passenger list created by many researchers and edited by Michael A. Findlay.\n\nThomas Cason of UVa has greatly updated and improved the titanic data frame using the Encyclopedia Titanica and created the dataset here. Some duplicate passengers have been dropped, many errors corrected, many missing ages filled in, and new variables created.\n", "license": "Public", "url": "https://www.openml.org/d/40945", "distribution": [{"@type": "sc:FileObject", "name": "passengers.csv", "contentSize": "117743 B", "contentUrl": "https://www.openml.org/data/get_csv/16826755/phpMYEkMl", "encodingFormat": "text/csv", "sha256": "c617db2c7470716250f6f001be51304c76bcc8815527ab8bae734bdca0735737"}, {"@type": "sc:FileObject", "name": "genders.csv", "description": "Maps gender values (\"male\", \"female\") to semantic URLs.", "contentSize": "117743 B", "contentUrl": "data/genders.csv", "encodingFormat": "text/csv", "sha256": "c617db2c7470716250f6f001be51304c76bcc8815527ab8bae734bdca0735737"}, {"@type": "sc:FileObject", "name": "embarkation_ports.csv", "description": "Maps Embarkation port initial to labeled values.", "contentSize": "117743 B", "contentUrl": "data/embarkation_ports.csv", "encodingFormat": "text/csv", "sha256": "c617db2c7470716250f6f001be51304c76bcc8815527ab8bae734bdca0735737"}], "recordSet": [{"@type": "ml:RecordSet", "name": "genders", "description": "Maps gender labels to semantic definitions.", "isEnumeration": true, "key": "label", "field": [{"@type": "ml:Field", "name": "label", "description": "One of {\"male\", \"female\"}", "dataType": ["sc:Text", "sc:name"], "source": {"distribution": "genders.csv", "extract": {"column": "label"}}}, {"@type": "ml:Field", "name": "url", "description": "Corresponding WikiData URL", "dataType": ["sc:URL", "wd:Q48277"], "source": {"distribution": "genders.csv", "extract": {"column": "url"}}}]}, {"@type": 
"ml:RecordSet", "name": "embarkation_ports", "description": "Maps Embarkation port initial to labeled values.", "isEnumeration": true, "key": "key", "field": [{"@type": "ml:Field", "name": "key", "description": "C, Q, S or ?", "dataType": "sc:Text", "source": {"distribution": "embarkation_ports.csv", "extract": {"column": "key"}}}, {"@type": "ml:Field", "name": "label", "description": "Human-readable label", "dataType": ["sc:Text", "sc:name"], "source": {"distribution": "embarkation_ports.csv", "extract": {"column": "label"}}}, {"@type": "ml:Field", "name": "url", "description": "Corresponding WikiData URL", "dataType": ["sc:URL", "wd:Q515"], "source": {"distribution": "embarkation_ports.csv", "extract": {"column": "url"}}}]}, {"@type": "ml:RecordSet", "name": "passengers", "description": "The list of passengers. Does not include crew members.", "field": [{"@type": "ml:Field", "name": "name", "description": "Name of the passenger", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "name"}}}, {"@type": "ml:Field", "name": "gender", "description": "Gender of passenger (male or female)", "dataType": "sc:Text", "references": {"field": "genders/label"}, "source": {"distribution": "passengers.csv", "extract": {"column": "sex"}}}, {"@type": "ml:Field", "name": "age", "description": "Age of passenger at time of death. 
It's a string, because some values can be `?`.", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "age"}}}, {"@type": "ml:Field", "name": "survived", "description": "Survival status of passenger (0: Lost, 1: Saved)", "dataType": "sc:Integer", "source": {"distribution": "passengers.csv", "extract": {"column": "survived"}}}, {"@type": "ml:Field", "name": "pclass", "description": "Passenger Class (1st/2nd/3rd)", "dataType": "sc:Integer", "source": {"distribution": "passengers.csv", "extract": {"column": "pclass"}}}, {"@type": "ml:Field", "name": "cabin", "description": "Passenger cabin.", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "cabin"}}}, {"@type": "ml:Field", "name": "embarked", "description": "Port of Embarkation (C: Cherbourg, Q: Queenstown, S: Southampton, ?: Unknown).", "dataType": "sc:Text", "references": {"field": "embarkation_ports/key"}, "source": {"distribution": "passengers.csv", "extract": {"column": "embarked"}}}, {"@type": "ml:Field", "name": "fare", "description": "Passenger Fare (British pound). 
It's a string, because some values can be `?`.", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "fare"}}}, {"@type": "ml:Field", "name": "home_destination", "description": "Home and destination", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "home.dest"}}}, {"@type": "ml:Field", "name": "ticket", "description": "Ticket Number, may include a letter.", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "ticket"}}}, {"@type": "ml:Field", "name": "num_parents_children", "description": "Number of Parents/Children Aboard", "dataType": "sc:Integer", "source": {"distribution": "passengers.csv", "extract": {"column": "parch"}}}, {"@type": "ml:Field", "name": "num_siblings_spouses", "description": "Number of Siblings/Spouses Aboard", "dataType": "sc:Integer", "source": {"distribution": "passengers.csv", "extract": {"column": "sibsp"}}}, {"@type": "ml:Field", "name": "boat", "description": "Lifeboat used by passenger", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "boat"}}}, {"@type": "ml:Field", "name": "body", "description": "Body Identification Number", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "body"}}}]}]}
cypress/downloads/croissant.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"@context": {"@language": "en", "@vocab": "https://schema.org/", "column": "ml:column", "data": {"@id": "ml:data", "@type": "@json"}, "dataType": {"@id": "ml:dataType", "@type": "@vocab"}, "extract": "ml:extract", "field": "ml:field", "fileProperty": "ml:fileProperty", "format": "ml:format", "includes": "ml:includes", "isEnumeration": "ml:isEnumeration", "jsonPath": "ml:jsonPath", "ml": "http://mlcommons.org/schema/", "parentField": "ml:parentField", "path": "ml:path", "recordSet": "ml:recordSet", "references": "ml:references", "regex": "ml:regex", "repeated": "ml:repeated", "replace": "ml:replace", "sc": "https://schema.org/", "separator": "ml:separator", "source": "ml:source", "subField": "ml:subField", "transform": "ml:transform", "wd": "https://www.wikidata.org/wiki/"}, "@type": "sc:Dataset", "name": "Titanic", "description": "The original Titanic dataset, describing the status of individual passengers on the Titanic.\n\n The titanic data does not contain information from the crew, but it does contain actual ages of half of the passengers. \n\n For more information about how this dataset was constructed: \nhttps://web.archive.org/web/20200802155940/http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3info.txt\n\nOther useful information (useful for prices description for example):\nhttp://campus.lakeforest.edu/frank/FILES/MLFfiles/Bio150/Titanic/TitanicMETA.pdf\n\n Also see the following article describing shortcomings of the dataset data:\nhttps://emma-stiefel.medium.com/plugging-holes-in-kaggles-titanic-dataset-an-introduction-to-combining-datasets-with-fuzzywuzzy-60a686699da7\n", "citation": "The principal source for data about Titanic passengers is the Encyclopedia Titanica (http://www.encyclopedia-titanica.org/). The datasets used here were begun by a variety of researchers. 
One of the original sources is Eaton & Haas (1994) Titanic: Triumph and Tragedy, Patrick Stephens Ltd, which includes a passenger list created by many researchers and edited by Michael A. Findlay.\n\nThomas Cason of UVa has greatly updated and improved the titanic data frame using the Encyclopedia Titanica and created the dataset here. Some duplicate passengers have been dropped, many errors corrected, many missing ages filled in, and new variables created.\n", "license": "Public", "url": "https://www.openml.org/d/40945", "distribution": [{"@type": "sc:FileObject", "name": "passengers.csv", "contentSize": "117743 B", "contentUrl": "https://www.openml.org/data/get_csv/16826755/phpMYEkMl", "encodingFormat": "text/csv", "sha256": "c617db2c7470716250f6f001be51304c76bcc8815527ab8bae734bdca0735737"}, {"@type": "sc:FileObject", "name": "genders.csv", "description": "Maps gender values (\"male\", \"female\") to semantic URLs.", "contentSize": "117743 B", "contentUrl": "data/genders.csv", "encodingFormat": "text/csv", "sha256": "c617db2c7470716250f6f001be51304c76bcc8815527ab8bae734bdca0735737"}, {"@type": "sc:FileObject", "name": "embarkation_ports.csv", "description": "Maps Embarkation port initial to labeled values.", "contentSize": "117743 B", "contentUrl": "data/embarkation_ports.csv", "encodingFormat": "text/csv", "sha256": "c617db2c7470716250f6f001be51304c76bcc8815527ab8bae734bdca0735737"}], "recordSet": [{"@type": "ml:RecordSet", "name": "genders", "description": "Maps gender labels to semantic definitions.", "isEnumeration": true, "key": "label", "field": [{"@type": "ml:Field", "name": "label", "description": "One of {\"male\", \"female\"}", "dataType": ["sc:Text", "sc:name"], "source": {"distribution": "genders.csv", "extract": {"column": "label"}}}, {"@type": "ml:Field", "name": "url", "description": "Corresponding WikiData URL", "dataType": ["sc:URL", "wd:Q48277"], "source": {"distribution": "genders.csv", "extract": {"column": "url"}}}]}, {"@type": 
"ml:RecordSet", "name": "embarkation_ports", "description": "Maps Embarkation port initial to labeled values.", "isEnumeration": true, "key": "key", "field": [{"@type": "ml:Field", "name": "key", "description": "C, Q, S or ?", "dataType": "sc:Text", "source": {"distribution": "embarkation_ports.csv", "extract": {"column": "key"}}}, {"@type": "ml:Field", "name": "label", "description": "Human-readable label", "dataType": ["sc:Text", "sc:name"], "source": {"distribution": "embarkation_ports.csv", "extract": {"column": "label"}}}, {"@type": "ml:Field", "name": "url", "description": "Corresponding WikiData URL", "dataType": ["sc:URL", "wd:Q515"], "source": {"distribution": "embarkation_ports.csv", "extract": {"column": "url"}}}]}, {"@type": "ml:RecordSet", "name": "passengers", "description": "The list of passengers. Does not include crew members.", "field": [{"@type": "ml:Field", "name": "name", "description": "Name of the passenger", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "name"}}}, {"@type": "ml:Field", "name": "gender", "description": "Gender of passenger (male or female)", "dataType": "sc:Text", "references": {"field": "genders/label"}, "source": {"distribution": "passengers.csv", "extract": {"column": "sex"}}}, {"@type": "ml:Field", "name": "age", "description": "Age of passenger at time of death. 
It's a string, because some values can be `?`.", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "age"}}}, {"@type": "ml:Field", "name": "survived", "description": "Survival status of passenger (0: Lost, 1: Saved)", "dataType": "sc:Integer", "source": {"distribution": "passengers.csv", "extract": {"column": "survived"}}}, {"@type": "ml:Field", "name": "pclass", "description": "Passenger Class (1st/2nd/3rd)", "dataType": "sc:Integer", "source": {"distribution": "passengers.csv", "extract": {"column": "pclass"}}}, {"@type": "ml:Field", "name": "cabin", "description": "Passenger cabin.", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "cabin"}}}, {"@type": "ml:Field", "name": "embarked", "description": "Port of Embarkation (C: Cherbourg, Q: Queenstown, S: Southampton, ?: Unknown).", "dataType": "sc:Text", "references": {"field": "embarkation_ports/key"}, "source": {"distribution": "passengers.csv", "extract": {"column": "embarked"}}}, {"@type": "ml:Field", "name": "fare", "description": "Passenger Fare (British pound). 
It's a string, because some values can be `?`.", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "fare"}}}, {"@type": "ml:Field", "name": "home_destination", "description": "Home and destination", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "home.dest"}}}, {"@type": "ml:Field", "name": "ticket", "description": "Ticket Number, may include a letter.", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "ticket"}}}, {"@type": "ml:Field", "name": "num_parents_children", "description": "Number of Parents/Children Aboard", "dataType": "sc:Integer", "source": {"distribution": "passengers.csv", "extract": {"column": "parch"}}}, {"@type": "ml:Field", "name": "num_siblings_spouses", "description": "Number of Siblings/Spouses Aboard", "dataType": "sc:Integer", "source": {"distribution": "passengers.csv", "extract": {"column": "sibsp"}}}, {"@type": "ml:Field", "name": "boat", "description": "Lifeboat used by passenger", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "boat"}}}, {"@type": "ml:Field", "name": "body", "description": "Body Identification Number", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "body"}}}]}]}
cypress/e2e/createManually.cy.js ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /// <reference types="cypress" />
2
+
3
+ import 'cypress-file-upload';
4
+ import 'cypress-iframe';
5
+
6
+
7
// End-to-end test: a FileObject resource can be declared by hand (no upload)
// and then shows up in the resource tree with the values that were typed in.
describe('Create a resource manually', () => {
  it('should allow adding a FileObject resource', () => {
    // Streamlit starts on :8501.
    cy.visit('http://localhost:8501')
    // Leave the splash screen by creating a new dataset, then fill the
    // mandatory name and URL.
    cy.get('button', {timeout: 10000}).contains('Create', {timeout: 10000}).click()
    cy.get('input[aria-label="Name:red[*]"]').type('MyDataset').blur()
    cy.get('[data-testid="stMarkdownContainer"]')
      .contains('Metadata')
      .click()
    cy.get('input[aria-label="URL:red[*]"]').type('https://mydataset.com', {force: true})

    // Create a resource manually.
    cy.get('[data-testid="stMarkdownContainer"]').contains('Resources').click()
    cy.get('[data-testid="stMarkdownContainer"]').contains('Add manually').click()

    cy.get('input[aria-label="File name:red[*]"]').type('test.csv').blur()
    cy.get('input[aria-label="SHA256"]').type('abcdefgh1234567').blur()
    cy.get('button').contains('Upload').click()

    // The file is created, so we can click on it to see the details.
    // The tree is a custom component rendered in an iframe, hence `cy.enter`.
    cy.enter('[title="components.tree.tree_component"]').then(getBody => {
      getBody().contains('test.csv').click()
    })

    // In the details, the SHA256 is read-only and holds the typed value.
    cy.get('input[aria-label="SHA256:red[*]"]')
      .should('be.disabled')
      .should('have.value', 'abcdefgh1234567')
  })
})
cypress/e2e/displayErrors.cy.js ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /// <reference types="cypress" />
2
+
3
+ import 'cypress-file-upload';
4
+
5
// End-to-end test: the overview shows an "Errors" section once the loaded
// Croissant file is made invalid in the editor.
describe('load existing errored croissant', () => {
  it('should display errors', () => {
    cy.visit('http://localhost:8501')

    // Load the `coco.json` fixture by dropping it on the file uploader.
    cy.fixture('coco.json').then((fileContent) => {
      const file = {
        fileContent,
        fileName: 'coco.json', mimeType: 'text/json',
      }
      cy.get(
        "[data-testid='stFileUploadDropzone']",
      ).attachFile(file, {
        force: true,
        subjectType: "drag-n-drop",
        events: ["dragenter", "drop"],
      })
    })
    // Right after loading, no error is displayed.
    cy.get('[data-testid="stMarkdownContainer"]').contains("Errors").should('not.exist')
    // Empty the `name` field to create an error:
    cy.get('[data-testid="stMarkdownContainer"]').contains('RecordSets').click()
    cy.contains('split_enums (2 fields)').click()
    cy.get('input[aria-label="Name:red[*]"][value="split_enums"]').should('be.visible').type('{selectall}{backspace}{enter}')
    // Back on the overview, the error is now reported.
    cy.get('[data-testid="stMarkdownContainer"]').contains('Overview').click()
    cy.get('[data-testid="stMarkdownContainer"]').contains("Errors").should('exist')
  })
})
cypress/e2e/loadCroissant.cy.js ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /// <reference types="cypress" />
2
+
3
+ import 'cypress-file-upload';
4
+ import * as path from 'path';
5
+
6
// End-to-end tests: an existing Croissant file can be loaded in the editor and
// exported again without being altered.
describe('Editor loads Croissant without Error', () => {
  it('should allow uploading existing croissant files', () => {
    cy.visit('http://localhost:8501')

    // Load the `titanic.json` fixture by dropping it on the file uploader.
    cy.fixture('titanic.json').then((fileContent) => {
      const file = {
        fileContent,
        fileName: 'titanic.json', mimeType: 'text/json',
      }
      cy.get(
        "[data-testid='stFileUploadDropzone']",
      ).attachFile(file, {
        force: true,
        subjectType: "drag-n-drop",
        events: ["dragenter", "drop"],
      })
    })
    cy.get('button').contains('Metadata').click()

    // The name of the loaded dataset is displayed.
    cy
      .get("[data-testid='element-container']")
      .contains('Titanic')
      .should('exist')

  })
  it('should download as json', () => {
    cy.visit('http://localhost:8501')

    // Load the `titanic.json` fixture by dropping it on the file uploader.
    cy.fixture('titanic.json').then((fileContent) => {
      const file = {
        fileContent,
        fileName: 'titanic.json', mimeType: 'text/json',
      }
      cy.get(
        "[data-testid='stFileUploadDropzone']",
      ).attachFile(file, {
        force: true,
        subjectType: "drag-n-drop",
        events: ["dragenter", "drop"],
      })
    })

    // Loading must not display a Streamlit exception.
    cy.get('[data-testid="stException"]').should('not.exist')

    cy.get('button').contains('Export').should('exist').should('be.visible').click({force: true})
    // The exported file must be equal to the fixture that was loaded. Both
    // sides are compared as JSON strings.
    cy.fixture('titanic.json').then((fileContent) => {
      const downloadsFolder = Cypress.config("downloadsFolder");
      cy.readFile(path.join(downloadsFolder, "croissant-titanic.json"))
        .then((downloadedFile) => {
          downloadedFile = JSON.stringify(downloadedFile)
          return downloadedFile
        })
        .should('deep.equal', JSON.stringify(fileContent))
    })
  })
})
cypress/e2e/renameDistribution.cy.js ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /// <reference types="cypress" />
2
+
3
+ import 'cypress-file-upload';
4
+ import 'cypress-iframe';
5
+
6
+
7
// End-to-end test: renaming a resource is propagated to the fields that use
// this resource as their source.
describe('Renaming of FileObjects/FileSets/RecordSets/Fields.', () => {
  it('should rename the FileObject/FileSet everywhere', () => {
    cy.visit('http://localhost:8501')

    // Load the `titanic.json` fixture by dropping it on the file uploader.
    cy.fixture('titanic.json').then((fileContent) => {
      const file = {
        fileContent,
        fileName: 'titanic.json', mimeType: 'text/json',
      }
      cy.get(
        "[data-testid='stFileUploadDropzone']",
      ).attachFile(file, {
        force: true,
        subjectType: "drag-n-drop",
        events: ["dragenter", "drop"],
      })
    })
    cy.get('button').contains('Resources').click()
    // The tree is a custom component rendered in an iframe, hence `cy.enter`.
    cy.enter('[title="components.tree.tree_component"]').then(getBody => {
      // Click on genders.csv
      getBody().contains('genders.csv').click()
    })
    // Replace the name of the resource.
    cy.get('input[aria-label="Name:red[*]"][value="genders.csv"]').type('{selectall}{backspace}the-new-name{enter}')

    // The fields of the `genders` RecordSet now display the new name.
    cy.get('button').contains('RecordSets').click()
    cy.contains('genders').click()
    cy.contains('Edit fields details').click()
    cy.contains('the-new-name')
  })
})
cypress/e2e/uploadCsv.cy.js ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /// <reference types="cypress" />
2
+
3
+ import 'cypress-file-upload';
4
+ import 'cypress-iframe';
5
+
6
+
7
// End-to-end test: uploading a local CSV creates a resource and infers a
// RecordSet with one typed field per column.
describe('Editor loads a local CSV as a resource', () => {
  it('should display the form: Overview, Metadata, Resources, & Record Sets', () => {
    // Streamlit starts on :8501.
    cy.visit('http://localhost:8501')
    // Leave the splash screen by creating a new dataset, then fill the
    // mandatory name and URL.
    cy.get('button', {timeout: 10000}).contains('Create', {timeout: 10000}).click()

    cy.get('input[aria-label="Name:red[*]"]').type('MyDataset').blur()
    cy.get('[data-testid="stMarkdownContainer"]')
      .contains('Metadata')
      .click()
    cy.get('input[aria-label="URL:red[*]"]').type('https://mydataset.com', {force: true})

    cy.get('[data-testid="stMarkdownContainer"]').contains('Resources').click()
    // Drag and drop mimicking: streamlit/e2e/specs/st_file_uploader.spec.js.
    cy.fixture('base.csv').then((fileContent) => {
      const file = {
        fileContent,
        fileName: 'base.csv', mimeType: 'text/csv',
      }
      cy.get(
        "[data-testid='stFileUploadDropzone']",
      ).attachFile(file, {
        force: true,
        subjectType: "drag-n-drop",
        events: ["dragenter", "drop"],
      })
    })
    cy.get('.uploadedFileData').contains('base.csv')
    cy.get('button').contains('Upload').click()
    // The file is uploaded, so we can click on it to see the details.
    // Fixed 2-second pause to give the resource time to be processed.
    cy.wait(2000)
    // The tree is a custom component rendered in an iframe, hence `cy.enter`.
    cy.enter('[title="components.tree.tree_component"]').then(getBody => {
      getBody().find('li').should('be.visible').click()
    })
    // For example, we see the first rows:
    cy.contains('First rows of data:')

    // On the record set page, we see the record set.
    cy.get('[data-testid="stMarkdownContainer"]').contains('RecordSets').click()
    cy.contains('base.csv_record_set (2 fields)').click()
    // We also see the fields with the proper types.
    cy.get('[data-testid="stDataFrameResizable"]').contains("column1")
    cy.get('[data-testid="stDataFrameResizable"]').contains("https://schema.org/Text")
    cy.get('[data-testid="stDataFrameResizable"]').contains("column2")
    cy.get('[data-testid="stDataFrameResizable"]').contains("https://schema.org/Integer")

    // I can edit the details of the fields.
    cy.contains('Edit fields details').click()
    cy.get('input[aria-label="Description"]').last().type('This is a nice custom description!{enter}')
    // The new description is displayed in the table of fields.
    cy.get('[data-testid="glide-cell-2-1"]').contains("This is a nice custom description!")
  })
})
cypress/fixtures/base.csv ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ column1,column2
2
+ A,1
3
+ B,2
4
+ C,3
cypress/fixtures/coco.json ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "@context": {
3
+ "@language": "en",
4
+ "@vocab": "https://schema.org/",
5
+ "column": "ml:column",
6
+ "data": {
7
+ "@id": "ml:data",
8
+ "@type": "@json"
9
+ },
10
+ "dataType": {
11
+ "@id": "ml:dataType",
12
+ "@type": "@vocab"
13
+ },
14
+ "extract": "ml:extract",
15
+ "field": "ml:field",
16
+ "fileProperty": "ml:fileProperty",
17
+ "format": "ml:format",
18
+ "includes": "ml:includes",
19
+ "isEnumeration": "ml:isEnumeration",
20
+ "jsonPath": "ml:jsonPath",
21
+ "ml": "http://mlcommons.org/schema/",
22
+ "parentField": "ml:parentField",
23
+ "path": "ml:path",
24
+ "recordSet": "ml:recordSet",
25
+ "references": "ml:references",
26
+ "regex": "ml:regex",
27
+ "repeated": "ml:repeated",
28
+ "replace": "ml:replace",
29
+ "sc": "https://schema.org/",
30
+ "separator": "ml:separator",
31
+ "source": "ml:source",
32
+ "subField": "ml:subField",
33
+ "transform": "ml:transform",
34
+ "wd": "https://www.wikidata.org/wiki/"
35
+ },
36
+ "@type": "sc:Dataset",
37
+ "name": "COCO",
38
+ "description": "COCO is a large-scale object detection, segmentation, and captioning dataset. WARNING: `metadata.json` is incomplete and does not fully define the COCO2014 dataset. It lacks `recordSet` definitions that would enable automatic loading of all the annotations.",
39
+ "citation": "@article{DBLP:journals/corr/LinMBHPRDZ14,\n author = {Tsung{-}Yi Lin and\n Michael Maire and\n Serge J. Belongie and\n Lubomir D. Bourdev and\n Ross B. Girshick and\n James Hays and\n Pietro Perona and\n Deva Ramanan and\n Piotr Doll{'{a}}r and\n C. Lawrence Zitnick},\n title = {Microsoft {COCO:} Common Objects in Context},\n journal = {CoRR},\n volume = {abs/1405.0312},\n year = {2014},\n url = {http://arxiv.org/abs/1405.0312},\n archivePrefix = {arXiv},\n eprint = {1405.0312},\n timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},\n biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}",
40
+ "license": [
41
+ "Creative Commons Attribution 4.0 License",
42
+ "https://www.flickr.com/creativecommons/"
43
+ ],
44
+ "url": "https://cocodataset.org/",
45
+ "distribution": [
46
+ {
47
+ "@type": "sc:FileObject",
48
+ "name": "train2014.zip",
49
+ "contentSize": "13510573713 B",
50
+ "contentUrl": "http://images.cocodataset.org/zips/train2014.zip",
51
+ "encodingFormat": "application/zip",
52
+ "sha256": "sha256"
53
+ },
54
+ {
55
+ "@type": "sc:FileObject",
56
+ "name": "val2014.zip",
57
+ "contentSize": "6645013297 B",
58
+ "contentUrl": "http://images.cocodataset.org/zips/val2014.zip",
59
+ "encodingFormat": "application/zip",
60
+ "sha256": "sha256"
61
+ },
62
+ {
63
+ "@type": "sc:FileObject",
64
+ "name": "test2014.zip",
65
+ "contentSize": "6660437059 B",
66
+ "contentUrl": "http://images.cocodataset.org/zips/test2014.zip",
67
+ "encodingFormat": "application/zip",
68
+ "sha256": "sha256"
69
+ },
70
+ {
71
+ "@type": "sc:FileSet",
72
+ "name": "image-files",
73
+ "containedIn": [
74
+ "train2014.zip",
75
+ "val2014.zip",
76
+ "test2014.zip"
77
+ ],
78
+ "encodingFormat": "image/jpeg",
79
+ "includes": "*.jpg"
80
+ },
81
+ {
82
+ "@type": "sc:FileObject",
83
+ "name": "annotations_trainval2014.zip",
84
+ "contentSize": "252872794 B",
85
+ "contentUrl": "http://images.cocodataset.org/annotations/annotations_trainval2014.zip",
86
+ "encodingFormat": "application/zip",
87
+ "sha256": "sha256"
88
+ },
89
+ {
90
+ "@type": "sc:FileSet",
91
+ "name": "caption_annotations-files",
92
+ "containedIn": "annotations_trainval2014.zip",
93
+ "encodingFormat": "application/json",
94
+ "includes": "annotations/captions_(val|train)2014.json"
95
+ },
96
+ {
97
+ "@type": "sc:FileSet",
98
+ "name": "person_keypoints_annotations",
99
+ "containedIn": "annotations_trainval2014.zip",
100
+ "encodingFormat": "application/json",
101
+ "includes": "annotations/person_keypoints_(val|train)2014.json"
102
+ },
103
+ {
104
+ "@type": "sc:FileSet",
105
+ "name": "instancesperson_keypoints_annotations",
106
+ "containedIn": "annotations_trainval2014.zip",
107
+ "encodingFormat": "application/json",
108
+ "includes": "annotations/instances_(val|train)2014.json"
109
+ },
110
+ {
111
+ "@type": "sc:FileObject",
112
+ "name": "image_info_test2014.zip",
113
+ "contentSize": "763464 B",
114
+ "contentUrl": "http://images.cocodataset.org/annotations/image_info_test2014.zip",
115
+ "encodingFormat": "application/zip",
116
+ "sha256": "sha256"
117
+ },
118
+ {
119
+ "@type": "sc:FileSet",
120
+ "name": "imageinfo",
121
+ "containedIn": "image_info_test2014.zip",
122
+ "encodingFormat": "application/json",
123
+ "includes": "annotations/image_info_test.json"
124
+ }
125
+ ],
126
+ "recordSet": [
127
+ {
128
+ "@type": "ml:RecordSet",
129
+ "name": "split_enums",
130
+ "description": "Maps split names to semantic values.",
131
+ "key": "name",
132
+ "field": [
133
+ {
134
+ "@type": "ml:Field",
135
+ "name": "name",
136
+ "description": "One of: train, val, test.",
137
+ "dataType": "sc:Text"
138
+ },
139
+ {
140
+ "@type": "ml:Field",
141
+ "name": "url",
142
+ "description": "Corresponding mlcommons.org definition URL",
143
+ "dataType": [
144
+ "sc:URL",
145
+ "wd:Q3985153"
146
+ ]
147
+ }
148
+ ],
149
+ "data": [
150
+ {
151
+ "name": "train",
152
+ "url": "https://mlcommons.org/definitions/training_split"
153
+ },
154
+ {
155
+ "name": "val",
156
+ "url": "https://mlcommons.org/definitions/validation_split"
157
+ },
158
+ {
159
+ "name": "test",
160
+ "url": "https://mlcommons.org/definitions/test_split"
161
+ }
162
+ ]
163
+ },
164
+ {
165
+ "@type": "ml:RecordSet",
166
+ "name": "images",
167
+ "key": "image_id",
168
+ "field": [
169
+ {
170
+ "@type": "ml:Field",
171
+ "name": "image_id",
172
+ "description": "The filename of the image. eg: COCO_train2014_000000000003.jpg",
173
+ "dataType": "sc:Text",
174
+ "source": {
175
+ "distribution": "image-files",
176
+ "extract": {
177
+ "fileProperty": "filename"
178
+ },
179
+ "transform": {
180
+ "regex": "^COCO_[train|val|test]2014_(\\d+)\\.jpg$"
181
+ }
182
+ }
183
+ },
184
+ {
185
+ "@type": "ml:Field",
186
+ "name": "image_filename",
187
+ "description": "The filename of the image. eg: COCO_train2014_000000000003.jpg",
188
+ "dataType": "sc:Text",
189
+ "source": {
190
+ "distribution": "image-files",
191
+ "extract": {
192
+ "fileProperty": "filename"
193
+ }
194
+ }
195
+ },
196
+ {
197
+ "@type": "ml:Field",
198
+ "name": "image_content",
199
+ "description": "The content of the image.",
200
+ "dataType": "sc:ImageObject",
201
+ "source": {
202
+ "distribution": "image-files",
203
+ "extract": {
204
+ "fileProperty": "content"
205
+ }
206
+ }
207
+ },
208
+ {
209
+ "@type": "ml:Field",
210
+ "name": "split",
211
+ "dataType": [
212
+ "sc:Text",
213
+ "wd:Q3985153"
214
+ ],
215
+ "references": {
216
+ "field": "split_enums/name"
217
+ },
218
+ "source": {
219
+ "distribution": "image-files",
220
+ "extract": {
221
+ "fileProperty": "fullpath"
222
+ },
223
+ "transform": {
224
+ "regex": "^(train|val|test)2014/.*\\.jpg$"
225
+ }
226
+ }
227
+ }
228
+ ]
229
+ },
230
+ {
231
+ "@type": "ml:RecordSet",
232
+ "name": "captions",
233
+ "key": "id",
234
+ "field": [
235
+ {
236
+ "@type": "ml:Field",
237
+ "name": "id",
238
+ "description": "The ID of the caption",
239
+ "dataType": "sc:Integer",
240
+ "source": {
241
+ "distribution": "caption_annotations-files",
242
+ "extract": {
243
+ "column": "id"
244
+ }
245
+ }
246
+ },
247
+ {
248
+ "@type": "ml:Field",
249
+ "name": "image_id",
250
+ "description": "The ID of the image",
251
+ "dataType": "sc:Integer",
252
+ "source": {
253
+ "distribution": "caption_annotations-files",
254
+ "extract": {
255
+ "column": "image_id"
256
+ }
257
+ }
258
+ },
259
+ {
260
+ "@type": "ml:Field",
261
+ "name": "caption",
262
+ "description": "The caption",
263
+ "dataType": [
264
+ "sc:Text",
265
+ "wd:Q18585177"
266
+ ],
267
+ "source": {
268
+ "distribution": "caption_annotations-files",
269
+ "extract": {
270
+ "column": "caption"
271
+ }
272
+ }
273
+ },
274
+ {
275
+ "@type": "ml:Field",
276
+ "name": "split",
277
+ "dataType": [
278
+ "sc:Text",
279
+ "wd:Q3985153"
280
+ ],
281
+ "references": {
282
+ "field": "split_enums/name"
283
+ },
284
+ "source": {
285
+ "distribution": "caption_annotations-files",
286
+ "extract": {
287
+ "fileProperty": "filename"
288
+ },
289
+ "transform": {
290
+ "regex": ".*_(val|train)2014\\.json$"
291
+ }
292
+ }
293
+ }
294
+ ]
295
+ },
296
+ {
297
+ "@type": "ml:RecordSet",
298
+ "name": "categories",
299
+ "isEnumeration": true,
300
+ "key": "id",
301
+ "field": [
302
+ {
303
+ "@type": "ml:Field",
304
+ "name": "id",
305
+ "description": "The ID of the category",
306
+ "dataType": "sc:Integer",
307
+ "source": {
308
+ "distribution": "instancesperson_keypoints_annotations",
309
+ "extract": {
310
+ "column": "id"
311
+ }
312
+ }
313
+ },
314
+ {
315
+ "@type": "ml:Field",
316
+ "name": "name",
317
+ "description": "The name of the category.",
318
+ "dataType": [
319
+ "sc:Text",
320
+ "sc:name"
321
+ ],
322
+ "source": {
323
+ "distribution": "instancesperson_keypoints_annotations",
324
+ "extract": {
325
+ "column": "name"
326
+ }
327
+ }
328
+ },
329
+ {
330
+ "@type": "ml:Field",
331
+ "name": "supercategory",
332
+ "description": "The name of the supercategory.",
333
+ "dataType": [
334
+ "sc:Text",
335
+ "sc:name"
336
+ ],
337
+ "isEnumeration": true,
338
+ "source": {
339
+ "distribution": "instancesperson_keypoints_annotations",
340
+ "extract": {
341
+ "column": "supercategory"
342
+ }
343
+ }
344
+ }
345
+ ]
346
+ },
347
+ {
348
+ "@type": "ml:RecordSet",
349
+ "name": "annotations",
350
+ "key": "id",
351
+ "field": [
352
+ {
353
+ "@type": "ml:Field",
354
+ "name": "id",
355
+ "description": "The ID of the annotation.",
356
+ "dataType": "sc:Integer",
357
+ "source": {
358
+ "distribution": "instancesperson_keypoints_annotations",
359
+ "extract": {
360
+ "column": "id"
361
+ }
362
+ }
363
+ },
364
+ {
365
+ "@type": "ml:Field",
366
+ "name": "category_id",
367
+ "description": "The ID of the category.",
368
+ "dataType": "sc:Integer",
369
+ "references": {
370
+ "field": "categories/id"
371
+ },
372
+ "source": {
373
+ "distribution": "instancesperson_keypoints_annotations",
374
+ "extract": {
375
+ "column": "category_id"
376
+ }
377
+ }
378
+ },
379
+ {
380
+ "@type": "ml:Field",
381
+ "name": "image_id",
382
+ "description": "The ID of the image.",
383
+ "dataType": "sc:Integer",
384
+ "references": {
385
+ "field": "images/image_id"
386
+ },
387
+ "source": {
388
+ "distribution": "instancesperson_keypoints_annotations",
389
+ "extract": {
390
+ "column": "image_id"
391
+ }
392
+ }
393
+ },
394
+ {
395
+ "@type": "ml:Field",
396
+ "name": "bbox",
397
+ "description": "The bounding box around annotated object[s].",
398
+ "dataType": "ml:BoundingBox",
399
+ "source": {
400
+ "distribution": "instancesperson_keypoints_annotations",
401
+ "extract": {
402
+ "column": "bbox"
403
+ }
404
+ }
405
+ }
406
+ ]
407
+ }
408
+ ]
409
+ }
cypress/fixtures/titanic.json ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "@context": {
3
+ "@language": "en",
4
+ "@vocab": "https://schema.org/",
5
+ "column": "ml:column",
6
+ "data": {
7
+ "@id": "ml:data",
8
+ "@type": "@json"
9
+ },
10
+ "dataType": {
11
+ "@id": "ml:dataType",
12
+ "@type": "@vocab"
13
+ },
14
+ "extract": "ml:extract",
15
+ "field": "ml:field",
16
+ "fileProperty": "ml:fileProperty",
17
+ "format": "ml:format",
18
+ "includes": "ml:includes",
19
+ "isEnumeration": "ml:isEnumeration",
20
+ "jsonPath": "ml:jsonPath",
21
+ "ml": "http://mlcommons.org/schema/",
22
+ "parentField": "ml:parentField",
23
+ "path": "ml:path",
24
+ "recordSet": "ml:recordSet",
25
+ "references": "ml:references",
26
+ "regex": "ml:regex",
27
+ "repeated": "ml:repeated",
28
+ "replace": "ml:replace",
29
+ "sc": "https://schema.org/",
30
+ "separator": "ml:separator",
31
+ "source": "ml:source",
32
+ "subField": "ml:subField",
33
+ "transform": "ml:transform",
34
+ "wd": "https://www.wikidata.org/wiki/"
35
+ },
36
+ "@type": "sc:Dataset",
37
+ "name": "Titanic",
38
+ "description": "The original Titanic dataset, describing the status of individual passengers on the Titanic.\n\n The titanic data does not contain information from the crew, but it does contain actual ages of half of the passengers. \n\n For more information about how this dataset was constructed: \nhttps://web.archive.org/web/20200802155940/http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3info.txt\n\nOther useful information (useful for prices description for example):\nhttp://campus.lakeforest.edu/frank/FILES/MLFfiles/Bio150/Titanic/TitanicMETA.pdf\n\n Also see the following article describing shortcomings of the dataset data:\nhttps://emma-stiefel.medium.com/plugging-holes-in-kaggles-titanic-dataset-an-introduction-to-combining-datasets-with-fuzzywuzzy-60a686699da7\n",
39
+ "citation": "The principal source for data about Titanic passengers is the Encyclopedia Titanica (http://www.encyclopedia-titanica.org/). The datasets used here were begun by a variety of researchers. One of the original sources is Eaton & Haas (1994) Titanic: Triumph and Tragedy, Patrick Stephens Ltd, which includes a passenger list created by many researchers and edited by Michael A. Findlay.\n\nThomas Cason of UVa has greatly updated and improved the titanic data frame using the Encyclopedia Titanica and created the dataset here. Some duplicate passengers have been dropped, many errors corrected, many missing ages filled in, and new variables created.\n",
40
+ "license": "Public",
41
+ "url": "https://www.openml.org/d/40945",
42
+ "distribution": [
43
+ {
44
+ "@type": "sc:FileObject",
45
+ "name": "passengers.csv",
46
+ "contentSize": "117743 B",
47
+ "contentUrl": "https://www.openml.org/data/get_csv/16826755/phpMYEkMl",
48
+ "encodingFormat": "text/csv",
49
+ "sha256": "c617db2c7470716250f6f001be51304c76bcc8815527ab8bae734bdca0735737"
50
+ },
51
+ {
52
+ "@type": "sc:FileObject",
53
+ "name": "genders.csv",
54
+ "description": "Maps gender values (\"male\", \"female\") to semantic URLs.",
55
+ "contentSize": "117743 B",
56
+ "contentUrl": "data/genders.csv",
57
+ "encodingFormat": "text/csv",
58
+ "sha256": "c617db2c7470716250f6f001be51304c76bcc8815527ab8bae734bdca0735737"
59
+ },
60
+ {
61
+ "@type": "sc:FileObject",
62
+ "name": "embarkation_ports.csv",
63
+ "description": "Maps Embarkation port initial to labeled values.",
64
+ "contentSize": "117743 B",
65
+ "contentUrl": "data/embarkation_ports.csv",
66
+ "encodingFormat": "text/csv",
67
+ "sha256": "c617db2c7470716250f6f001be51304c76bcc8815527ab8bae734bdca0735737"
68
+ }
69
+ ],
70
+ "recordSet": [
71
+ {
72
+ "@type": "ml:RecordSet",
73
+ "name": "genders",
74
+ "description": "Maps gender labels to semantic definitions.",
75
+ "isEnumeration": true,
76
+ "key": "label",
77
+ "field": [
78
+ {
79
+ "@type": "ml:Field",
80
+ "name": "label",
81
+ "description": "One of {\"male\", \"female\"}",
82
+ "dataType": [
83
+ "sc:Text",
84
+ "sc:name"
85
+ ],
86
+ "source": {
87
+ "distribution": "genders.csv",
88
+ "extract": {
89
+ "column": "label"
90
+ }
91
+ }
92
+ },
93
+ {
94
+ "@type": "ml:Field",
95
+ "name": "url",
96
+ "description": "Corresponding WikiData URL",
97
+ "dataType": [
98
+ "sc:URL",
99
+ "wd:Q48277"
100
+ ],
101
+ "source": {
102
+ "distribution": "genders.csv",
103
+ "extract": {
104
+ "column": "url"
105
+ }
106
+ }
107
+ }
108
+ ]
109
+ },
110
+ {
111
+ "@type": "ml:RecordSet",
112
+ "name": "embarkation_ports",
113
+ "description": "Maps Embarkation port initial to labeled values.",
114
+ "isEnumeration": true,
115
+ "key": "key",
116
+ "field": [
117
+ {
118
+ "@type": "ml:Field",
119
+ "name": "key",
120
+ "description": "C, Q, S or ?",
121
+ "dataType": "sc:Text",
122
+ "source": {
123
+ "distribution": "embarkation_ports.csv",
124
+ "extract": {
125
+ "column": "key"
126
+ }
127
+ }
128
+ },
129
+ {
130
+ "@type": "ml:Field",
131
+ "name": "label",
132
+ "description": "Human-readable label",
133
+ "dataType": [
134
+ "sc:Text",
135
+ "sc:name"
136
+ ],
137
+ "source": {
138
+ "distribution": "embarkation_ports.csv",
139
+ "extract": {
140
+ "column": "label"
141
+ }
142
+ }
143
+ },
144
+ {
145
+ "@type": "ml:Field",
146
+ "name": "url",
147
+ "description": "Corresponding WikiData URL",
148
+ "dataType": [
149
+ "sc:URL",
150
+ "wd:Q515"
151
+ ],
152
+ "source": {
153
+ "distribution": "embarkation_ports.csv",
154
+ "extract": {
155
+ "column": "url"
156
+ }
157
+ }
158
+ }
159
+ ]
160
+ },
161
+ {
162
+ "@type": "ml:RecordSet",
163
+ "name": "passengers",
164
+ "description": "The list of passengers. Does not include crew members.",
165
+ "field": [
166
+ {
167
+ "@type": "ml:Field",
168
+ "name": "name",
169
+ "description": "Name of the passenger",
170
+ "dataType": "sc:Text",
171
+ "source": {
172
+ "distribution": "passengers.csv",
173
+ "extract": {
174
+ "column": "name"
175
+ }
176
+ }
177
+ },
178
+ {
179
+ "@type": "ml:Field",
180
+ "name": "gender",
181
+ "description": "Gender of passenger (male or female)",
182
+ "dataType": "sc:Text",
183
+ "references": {
184
+ "field": "genders/label"
185
+ },
186
+ "source": {
187
+ "distribution": "passengers.csv",
188
+ "extract": {
189
+ "column": "sex"
190
+ }
191
+ }
192
+ },
193
+ {
194
+ "@type": "ml:Field",
195
+ "name": "age",
196
+ "description": "Age of passenger at time of death. It's a string, because some values can be `?`.",
197
+ "dataType": "sc:Text",
198
+ "source": {
199
+ "distribution": "passengers.csv",
200
+ "extract": {
201
+ "column": "age"
202
+ }
203
+ }
204
+ },
205
+ {
206
+ "@type": "ml:Field",
207
+ "name": "survived",
208
+ "description": "Survival status of passenger (0: Lost, 1: Saved)",
209
+ "dataType": "sc:Integer",
210
+ "source": {
211
+ "distribution": "passengers.csv",
212
+ "extract": {
213
+ "column": "survived"
214
+ }
215
+ }
216
+ },
217
+ {
218
+ "@type": "ml:Field",
219
+ "name": "pclass",
220
+ "description": "Passenger Class (1st/2nd/3rd)",
221
+ "dataType": "sc:Integer",
222
+ "source": {
223
+ "distribution": "passengers.csv",
224
+ "extract": {
225
+ "column": "pclass"
226
+ }
227
+ }
228
+ },
229
+ {
230
+ "@type": "ml:Field",
231
+ "name": "cabin",
232
+ "description": "Passenger cabin.",
233
+ "dataType": "sc:Text",
234
+ "source": {
235
+ "distribution": "passengers.csv",
236
+ "extract": {
237
+ "column": "cabin"
238
+ }
239
+ }
240
+ },
241
+ {
242
+ "@type": "ml:Field",
243
+ "name": "embarked",
244
+ "description": "Port of Embarkation (C: Cherbourg, Q: Queenstown, S: Southampton, ?: Unknown).",
245
+ "dataType": "sc:Text",
246
+ "references": {
247
+ "field": "embarkation_ports/key"
248
+ },
249
+ "source": {
250
+ "distribution": "passengers.csv",
251
+ "extract": {
252
+ "column": "embarked"
253
+ }
254
+ }
255
+ },
256
+ {
257
+ "@type": "ml:Field",
258
+ "name": "fare",
259
+ "description": "Passenger Fare (British pound). It's a string, because some values can be `?`.",
260
+ "dataType": "sc:Text",
261
+ "source": {
262
+ "distribution": "passengers.csv",
263
+ "extract": {
264
+ "column": "fare"
265
+ }
266
+ }
267
+ },
268
+ {
269
+ "@type": "ml:Field",
270
+ "name": "home_destination",
271
+ "description": "Home and destination",
272
+ "dataType": "sc:Text",
273
+ "source": {
274
+ "distribution": "passengers.csv",
275
+ "extract": {
276
+ "column": "home.dest"
277
+ }
278
+ }
279
+ },
280
+ {
281
+ "@type": "ml:Field",
282
+ "name": "ticket",
283
+ "description": "Ticket Number, may include a letter.",
284
+ "dataType": "sc:Text",
285
+ "source": {
286
+ "distribution": "passengers.csv",
287
+ "extract": {
288
+ "column": "ticket"
289
+ }
290
+ }
291
+ },
292
+ {
293
+ "@type": "ml:Field",
294
+ "name": "num_parents_children",
295
+ "description": "Number of Parents/Children Aboard",
296
+ "dataType": "sc:Integer",
297
+ "source": {
298
+ "distribution": "passengers.csv",
299
+ "extract": {
300
+ "column": "parch"
301
+ }
302
+ }
303
+ },
304
+ {
305
+ "@type": "ml:Field",
306
+ "name": "num_siblings_spouses",
307
+ "description": "Number of Siblings/Spouses Aboard",
308
+ "dataType": "sc:Integer",
309
+ "source": {
310
+ "distribution": "passengers.csv",
311
+ "extract": {
312
+ "column": "sibsp"
313
+ }
314
+ }
315
+ },
316
+ {
317
+ "@type": "ml:Field",
318
+ "name": "boat",
319
+ "description": "Lifeboat used by passenger",
320
+ "dataType": "sc:Text",
321
+ "source": {
322
+ "distribution": "passengers.csv",
323
+ "extract": {
324
+ "column": "boat"
325
+ }
326
+ }
327
+ },
328
+ {
329
+ "@type": "ml:Field",
330
+ "name": "body",
331
+ "description": "Body Identification Number",
332
+ "dataType": "sc:Text",
333
+ "source": {
334
+ "distribution": "passengers.csv",
335
+ "extract": {
336
+ "column": "body"
337
+ }
338
+ }
339
+ }
340
+ ]
341
+ }
342
+ ]
343
+ }
cypress/screenshots/uploadCsv.cy.js/Editor loads a local CSV as a resource -- should display the form Overview, Metadata, Resources, & Record Sets (failed).png ADDED
cypress/support/e2e.js ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+
2
+ import "./resize_observer"
3
+
4
// Run the custom `ignore_resize_observer` command (imported above from
// ./resize_observer) ahead of every test in the suite.
beforeEach(function installResizeObserverGuard() {
  cy.ignore_resize_observer()
})
cypress/support/resize_observer.js ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
// Custom command: swallow the benign "ResizeObserver loop" uncaught exception
// so that it does not fail the running test.
Cypress.Commands.add("ignore_resize_observer", () => {
  // Browsers report the same condition under two wordings: the older
  // "loop limit exceeded" and the newer "loop completed with undelivered
  // notifications". Match both.
  const resizeObserverLoopErrRe =
    /ResizeObserver loop (limit exceeded|completed with undelivered notifications)/

  // consensus was that this exception didn't matter mostly, and was intermittent when running tests.
  // https://stackoverflow.com/questions/63653605/resizeobserver-loop-limit-exceeded-api-is-never-used
  //
  // Bind with `cy.on` rather than `Cypress.on`: this command runs from a
  // `beforeEach` hook, and `cy.on` listeners are removed when the test ends,
  // whereas `Cypress.on` would stack one more global listener per test.
  cy.on('uncaught:exception', (err) => {
    if (resizeObserverLoopErrRe.test(err.message)) {
      // Returning false tells Cypress not to fail the test on this error.
      return false
    }
    // Any other error falls through (undefined) and is handled as usual.
  })
})
events/__init__.py ADDED
File without changes
events/fields.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import enum
2
+ from typing import Any
3
+
4
+ import streamlit as st
5
+
6
+ from core.state import Field
7
+ from core.state import Metadata
8
+ import mlcroissant as mlc
9
+
10
+
11
class ExtractType:
    """Labels naming each way a value can be extracted from a source.

    `_get_source` compares the selected value against these constants to
    decide which `mlc.Extract` to build.
    """

    # Extraction driven by a user-supplied locator.
    COLUMN = "Column"
    JSON_PATH = "JSON path"
    # Extraction of a property of the file itself.
    FILE_CONTENT = "File content"
    FILE_NAME = "File name"
    FILE_PATH = "File path"
    FILE_FULLPATH = "Full path"
    FILE_LINES = "Lines in file"
    FILE_LINE_NUMBERS = "Line numbers in file"
22
+
23
+
24
class TransformType:
    """Labels naming each kind of transformation applicable to a source.

    `handle_field_change` compares its `change` argument against these
    constants to decide which `mlc.Transform` to build.
    """

    FORMAT = "Apply format"
    JSON_PATH = "Apply JSON path"
    REGEX = "Apply regular expression"
    REPLACE = "Replace"
    SEPARATOR = "Separator"
32
+
33
+
34
def _get_source(source: mlc.Source | None, value: Any) -> mlc.Source:
    """Return `source` with its `extract` reset to match the chosen extract type.

    Args:
        source: The source to update. When falsy, a new `mlc.Source` holding
            an empty `mlc.Extract` is created first.
        value: The selected extract type; compared against the `ExtractType`
            labels.

    Returns:
        The (possibly newly created) source. If `value` matches none of the
        `ExtractType` labels, the extract is left as is.
    """
    if not source:
        source = mlc.Source(extract=mlc.Extract())

    # Locator-based extractions start out with an empty locator.
    if value == ExtractType.COLUMN:
        source.extract = mlc.Extract(column="")
        return source
    if value == ExtractType.JSON_PATH:
        source.extract = mlc.Extract(json_path="")
        return source

    # File-property extractions: label -> attribute name on mlc.FileProperty.
    # The attribute is only looked up for the label that actually matches.
    file_properties = (
        (ExtractType.FILE_CONTENT, "content"),
        (ExtractType.FILE_NAME, "filename"),
        (ExtractType.FILE_PATH, "filepath"),
        (ExtractType.FILE_FULLPATH, "fullpath"),
        (ExtractType.FILE_LINES, "lines"),
        (ExtractType.FILE_LINE_NUMBERS, "lineNumbers"),
    )
    for label, property_name in file_properties:
        if value == label:
            file_property = getattr(mlc.FileProperty, property_name)
            source.extract = mlc.Extract(file_property=file_property)
            break
    return source
54
+
55
+
56
class FieldEvent(enum.Enum):
    """Kinds of edits that `handle_field_change` can apply to a field."""

    # Basic attributes of the field.
    NAME = "NAME"
    DESCRIPTION = "DESCRIPTION"
    DATA_TYPE = "DATA_TYPE"
    # The field's source and how the value is extracted from it.
    SOURCE = "SOURCE"
    SOURCE_EXTRACT = "SOURCE_EXTRACT"
    SOURCE_EXTRACT_COLUMN = "SOURCE_EXTRACT_COLUMN"
    SOURCE_EXTRACT_JSON_PATH = "SOURCE_EXTRACT_JSON_PATH"
    # Transforms attached to the source.
    TRANSFORM = "TRANSFORM"
    TRANSFORM_FORMAT = "TRANSFORM_FORMAT"
    # The field's reference and how the value is extracted from it.
    REFERENCE = "REFERENCE"
    REFERENCE_EXTRACT = "REFERENCE_EXTRACT"
    REFERENCE_EXTRACT_COLUMN = "REFERENCE_EXTRACT_COLUMN"
    REFERENCE_EXTRACT_JSON_PATH = "REFERENCE_EXTRACT_JSON_PATH"
72
+
73
+
74
# Maps each transform label to the `mlc.Transform` keyword argument that
# receives the edited value.
_TRANSFORM_KWARGS = {
    TransformType.FORMAT: "format",
    TransformType.JSON_PATH: "json_path",
    TransformType.REGEX: "regex",
    TransformType.REPLACE: "replace",
    TransformType.SEPARATOR: "separator",
}


def _replace_transform(field: Field, number: int | None, **transform_kwargs) -> None:
    """Overwrite the `number`-th transform of `field.source`, if there is one.

    Does nothing when `number` is None, when the field has no source, or when
    `number` is not below the number of existing transforms.
    """
    if number is None or field.source is None:
        # Without a source there is no transform list to update.
        return
    transforms = field.source.transforms
    if transforms and number < len(transforms):
        transforms[number] = mlc.Transform(**transform_kwargs)


def handle_field_change(
    change: FieldEvent | str,
    field: Field,
    key: str,
    **kwargs,
):
    """Apply the value stored in `st.session_state[key]` to `field`.

    Args:
        change: What to update: a `FieldEvent` member or one of the
            `TransformType` labels.
        field: The field to mutate in place.
        key: Key in `st.session_state` holding the new value.
        **kwargs: For transform changes, `number` is the index of the
            transform to replace within `field.source.transforms`.

    Unknown `change` values are ignored.
    """
    value = st.session_state[key]
    if change == FieldEvent.NAME:
        old_name = field.name
        new_name = value
        if old_name != new_name:
            # Let the metadata update whatever refers to the old name.
            metadata: Metadata = st.session_state[Metadata]
            metadata.rename_field(old_name=old_name, new_name=new_name)
            field.name = value
    elif change == FieldEvent.DESCRIPTION:
        field.description = value
    elif change == FieldEvent.DATA_TYPE:
        field.data_types = [value]
    elif change == FieldEvent.SOURCE:
        # A uid containing "/" is treated as a field, otherwise a distribution.
        node_type = "field" if "/" in value else "distribution"
        field.source = mlc.Source(uid=value, node_type=node_type)
    elif change == FieldEvent.SOURCE_EXTRACT:
        field.source = _get_source(field.source, value)
    elif change == FieldEvent.SOURCE_EXTRACT_COLUMN:
        if not field.source:
            field.source = mlc.Source(extract=mlc.Extract())
        field.source.extract = mlc.Extract(column=value)
    elif change == FieldEvent.SOURCE_EXTRACT_JSON_PATH:
        if not field.source:
            field.source = mlc.Source(extract=mlc.Extract())
        field.source.extract = mlc.Extract(json_path=value)
    elif change == FieldEvent.TRANSFORM:
        # Reset the selected transform to an empty one.
        _replace_transform(field, kwargs.get("number"))
    elif change in _TRANSFORM_KWARGS:
        # `change` is a TransformType label: rebuild the selected transform
        # with the new value under the matching keyword.
        _replace_transform(
            field,
            kwargs.get("number"),
            **{_TRANSFORM_KWARGS[change]: value},
        )
    elif change == FieldEvent.REFERENCE:
        node_type = "field" if "/" in value else "distribution"
        field.references = mlc.Source(uid=value, node_type=node_type)
    elif change == FieldEvent.REFERENCE_EXTRACT:
        field.references = _get_source(field.references, value)
    elif change == FieldEvent.REFERENCE_EXTRACT_COLUMN:
        if not field.references:
            field.references = mlc.Source(extract=mlc.Extract())
        field.references.extract = mlc.Extract(column=value)
    elif change == FieldEvent.REFERENCE_EXTRACT_JSON_PATH:
        if not field.references:
            field.references = mlc.Source(extract=mlc.Extract())
        field.references.extract = mlc.Extract(json_path=value)
events/metadata.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import enum
2
+
3
+ import streamlit as st
4
+
5
+ from core.state import Metadata
6
+
7
+
8
class MetadataEvent(enum.Enum):
    """Kinds of edits that `handle_metadata_change` can apply to the metadata."""

    # Each member's value is its own name.
    NAME = "NAME"
    DESCRIPTION = "DESCRIPTION"
    URL = "URL"
    LICENSE = "LICENSE"
    CITATION = "CITATION"
16
+
17
+
18
def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
    """Copy `st.session_state[key]` onto the metadata attribute named by `event`.

    Args:
        event: Which metadata attribute to update.
        metadata: The metadata object, mutated in place.
        key: Key in `st.session_state` holding the new value.

    An `event` that matches no known member leaves `metadata` untouched and
    does not read the session state.
    """
    attribute_by_event = {
        MetadataEvent.NAME: "name",
        MetadataEvent.DESCRIPTION: "description",
        MetadataEvent.LICENSE: "license",
        MetadataEvent.CITATION: "citation",
        MetadataEvent.URL: "url",
    }
    attribute = attribute_by_event.get(event)
    if attribute is None:
        return
    setattr(metadata, attribute, st.session_state[key])
events/record_sets.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import enum
2
+
3
+ import streamlit as st
4
+
5
+ from core.state import Metadata
6
+ from core.state import RecordSet
7
+
8
+
9
class RecordSetEvent(enum.Enum):
    """Kinds of edits that `handle_record_set_change` can apply to a RecordSet."""

    # Each member's value is its own name.
    NAME = "NAME"
    DESCRIPTION = "DESCRIPTION"
    IS_ENUMERATION = "IS_ENUMERATION"
15
+
16
+
17
def handle_record_set_change(event: RecordSetEvent, record_set: RecordSet, key: str):
    """Apply the value stored in `st.session_state[key]` to `record_set`.

    Args:
        event: Which attribute of the record set to update.
        record_set: The record set, mutated in place.
        key: Key in `st.session_state` holding the new value.
    """
    new_value = st.session_state[key]

    if event == RecordSetEvent.DESCRIPTION:
        record_set.description = new_value
        return
    if event == RecordSetEvent.IS_ENUMERATION:
        record_set.is_enumeration = new_value
        return
    if event == RecordSetEvent.NAME:
        previous_name = record_set.name
        if previous_name != new_value:
            # Let the metadata update whatever refers to the old name before
            # the record set itself is renamed.
            metadata: Metadata = st.session_state[Metadata]
            metadata.rename_record_set(old_name=previous_name, new_name=new_value)
            record_set.name = new_value
events/resources.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import enum
2
+
3
+ import streamlit as st
4
+
5
+ from core.state import FileObject
6
+ from core.state import FileSet
7
+ from core.state import Metadata
8
+
9
# Union alias used to annotate the `resource` argument of
# `handle_resource_change`.
Resource = FileObject | FileSet
10
+
11
+
12
class ResourceEvent(enum.Enum):
    """Kinds of edits that `handle_resource_change` can apply to a resource."""

    # Each member's value is its own name.
    NAME = "NAME"
    DESCRIPTION = "DESCRIPTION"
    ENCODING_FORMAT = "ENCODING_FORMAT"
    SHA256 = "SHA256"
    CONTENT_SIZE = "CONTENT_SIZE"
    CONTENT_URL = "CONTENT_URL"
21
+
22
+
23
def handle_resource_change(event: ResourceEvent, resource: Resource, key: str):
    """Apply the value stored in `st.session_state[key]` to `resource`.

    Args:
        event: Which attribute of the resource to update.
        resource: The resource, mutated in place.
        key: Key in `st.session_state` holding the new value.
    """
    new_value = st.session_state[key]

    if event == ResourceEvent.NAME:
        previous_name = resource.name
        if previous_name != new_value:
            # Let the metadata update whatever refers to the old name before
            # the resource itself is renamed.
            metadata: Metadata = st.session_state[Metadata]
            metadata.rename_distribution(old_name=previous_name, new_name=new_value)
            resource.name = new_value
        return

    # Every other event is a plain assignment onto one attribute.
    attribute_by_event = {
        ResourceEvent.DESCRIPTION: "description",
        ResourceEvent.ENCODING_FORMAT: "encoding_format",
        ResourceEvent.SHA256: "sha256",
        ResourceEvent.CONTENT_SIZE: "content_size",
        ResourceEvent.CONTENT_URL: "content_url",
    }
    attribute = attribute_by_event.get(event)
    if attribute is not None:
        setattr(resource, attribute, new_value)