sradc committed on
Commit
e79038e
1 Parent(s): e100f99

filter_images.ipynb -> manually filter videos by id

Browse files
Files changed (1) hide show
  1. _dev/filter_images.ipynb +35 -5
_dev/filter_images.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 5,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
@@ -21,17 +21,17 @@
21
  },
22
  {
23
  "cell_type": "code",
24
- "execution_count": 2,
25
  "metadata": {},
26
  "outputs": [],
27
  "source": [
28
- "df = pd.read_parquet(DATAFRAME_PATH)\n",
29
  "dim_columns = df.filter(regex=\"^dim_\").columns"
30
  ]
31
  },
32
  {
33
  "cell_type": "code",
34
- "execution_count": 14,
35
  "metadata": {},
36
  "outputs": [
37
  {
@@ -108,9 +108,39 @@
108
  "print(len(df_))"
109
  ]
110
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  {
112
  "cell_type": "code",
113
- "execution_count": 28,
114
  "metadata": {},
115
  "outputs": [],
116
  "source": [
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": null,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
 
21
  },
22
  {
23
  "cell_type": "code",
24
+ "execution_count": 3,
25
  "metadata": {},
26
  "outputs": [],
27
  "source": [
28
+ "df = pd.read_parquet(\"../data/dataset-unfiltered.parquet\")\n",
29
  "dim_columns = df.filter(regex=\"^dim_\").columns"
30
  ]
31
  },
32
  {
33
  "cell_type": "code",
34
+ "execution_count": 4,
35
  "metadata": {},
36
  "outputs": [
37
  {
 
108
  "print(len(df_))"
109
  ]
110
  },
111
+ {
112
+ "attachments": {},
113
+ "cell_type": "markdown",
114
+ "metadata": {},
115
+ "source": [
116
+ "## Manually filter videos"
117
+ ]
118
+ },
119
+ {
120
+ "cell_type": "code",
121
+ "execution_count": 10,
122
+ "metadata": {},
123
+ "outputs": [
124
+ {
125
+ "name": "stdout",
126
+ "output_type": "stream",
127
+ "text": [
128
+ "69692\n"
129
+ ]
130
+ }
131
+ ],
132
+ "source": [
133
+ "IDS_TO_FILTER = [\n",
134
+ " \"p7FCgw_GlWc\",\n",
135
+ "]\n",
136
+ "# remove videos with ids in IDS_TO_FILTER\n",
137
+ "df_ = df_[~df_[\"video_id\"].isin(IDS_TO_FILTER)]\n",
138
+ "print(len(df_))"
139
+ ]
140
+ },
141
  {
142
  "cell_type": "code",
143
+ "execution_count": 11,
144
  "metadata": {},
145
  "outputs": [],
146
  "source": [