Spaces:
Runtime error
Runtime error
sradc
committed on
Commit
•
e79038e
1
Parent(s):
e100f99
filter_images.ipynb -> manually filter videos by id
Browse files - _dev/filter_images.ipynb +35 -5
_dev/filter_images.ipynb
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
-
"execution_count":
|
6 |
"metadata": {},
|
7 |
"outputs": [],
|
8 |
"source": [
|
@@ -21,17 +21,17 @@
|
|
21 |
},
|
22 |
{
|
23 |
"cell_type": "code",
|
24 |
-
"execution_count":
|
25 |
"metadata": {},
|
26 |
"outputs": [],
|
27 |
"source": [
|
28 |
-
"df = pd.read_parquet(
|
29 |
"dim_columns = df.filter(regex=\"^dim_\").columns"
|
30 |
]
|
31 |
},
|
32 |
{
|
33 |
"cell_type": "code",
|
34 |
-
"execution_count":
|
35 |
"metadata": {},
|
36 |
"outputs": [
|
37 |
{
|
@@ -108,9 +108,39 @@
|
|
108 |
"print(len(df_))"
|
109 |
]
|
110 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
{
|
112 |
"cell_type": "code",
|
113 |
-
"execution_count":
|
114 |
"metadata": {},
|
115 |
"outputs": [],
|
116 |
"source": [
|
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
"metadata": {},
|
7 |
"outputs": [],
|
8 |
"source": [
|
|
|
21 |
},
|
22 |
{
|
23 |
"cell_type": "code",
|
24 |
+
"execution_count": 3,
|
25 |
"metadata": {},
|
26 |
"outputs": [],
|
27 |
"source": [
|
28 |
+
"df = pd.read_parquet(\"../data/dataset-unfiltered.parquet\")\n",
|
29 |
"dim_columns = df.filter(regex=\"^dim_\").columns"
|
30 |
]
|
31 |
},
|
32 |
{
|
33 |
"cell_type": "code",
|
34 |
+
"execution_count": 4,
|
35 |
"metadata": {},
|
36 |
"outputs": [
|
37 |
{
|
|
|
108 |
"print(len(df_))"
|
109 |
]
|
110 |
},
|
111 |
+
{
|
112 |
+
"attachments": {},
|
113 |
+
"cell_type": "markdown",
|
114 |
+
"metadata": {},
|
115 |
+
"source": [
|
116 |
+
"## Manually filter videos"
|
117 |
+
]
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"cell_type": "code",
|
121 |
+
"execution_count": 10,
|
122 |
+
"metadata": {},
|
123 |
+
"outputs": [
|
124 |
+
{
|
125 |
+
"name": "stdout",
|
126 |
+
"output_type": "stream",
|
127 |
+
"text": [
|
128 |
+
"69692\n"
|
129 |
+
]
|
130 |
+
}
|
131 |
+
],
|
132 |
+
"source": [
|
133 |
+
"IDS_TO_FILTER = [\n",
|
134 |
+
" \"p7FCgw_GlWc\",\n",
|
135 |
+
"]\n",
|
136 |
+
"# remove videos with ids in IDS_TO_FILTER\n",
|
137 |
+
"df_ = df_[~df_[\"video_id\"].isin(IDS_TO_FILTER)]\n",
|
138 |
+
"print(len(df_))"
|
139 |
+
]
|
140 |
+
},
|
141 |
{
|
142 |
"cell_type": "code",
|
143 |
+
"execution_count": 11,
|
144 |
"metadata": {},
|
145 |
"outputs": [],
|
146 |
"source": [
|