{ "cells": [ { "cell_type": "code", "execution_count": 106, "metadata": { "id": "f-ERaM64ONeC" }, "outputs": [], "source": [ "# Load the raw metadata export into a DataFrame.\n", "import pandas as pd\n", "\n", "filename = '/content/U3_Metadaten.csv'\n", "# Malformed rows are still dropped, but 'warn' reports each one so the\n", "# data loss is visible instead of silent.\n", "df = pd.read_csv(filename, on_bad_lines='warn')" ] }, { "cell_type": "code", "execution_count": 118, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "AYxRURTvQiFb", "outputId": "18bf4139-47ac-4939-e635-9f09f560200c" }, "outputs": [ { "data": { "application/vnd.google.colaboratory.intrinsic+json": { "summary": "{\n \"name\": \"clean_df\",\n \"rows\": 158,\n \"fields\": [\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 158,\n \"samples\": [\n \"ISB-020-U3-W-R-01-B17012-028-000\",\n \"ISB-020-U3-W-L-01-B15100-018-000\",\n \"ISB-020-U3-W-R-01-B17012-034-000\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Beschreibung\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 40,\n \"samples\": [\n \"Foto\",\n \"Bodenheizung / Ventileinstellung / FBH AB PM\",\n \"Foto - Novocon S demontiert und Stellenantriebe montiert!\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Disziplin\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"D - Datennetz\",\n \"E - Elektroanlagen\",\n \"S - Sanitaer\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", "type": "dataframe", "variable_name": "clean_df" }, "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameBeschreibungDisziplin
0ISB-020-U3-W-D-01-B07005-001-000Bauarten und Stuecknachweis SGKD - Datennetz
1ISB-020-U3-W-D-01-B07005-002-000Bauarten und Stuecknachweis SGKD - Datennetz
2ISB-020-U3-W-D-01-B07005-003-000Pruefprotokoll nach DIN EN 61439-1/3D - Datennetz
3ISB-020-U3-W-D-01-B07005-004-000Pruefprotokoll nach DIN EN 61439-1/3D - Datennetz
4ISB-020-U3-W-D-01-B18012-001-000Sicherungslegende G-020 U3 779-AS 1D - Datennetz
............
153ISB-020-U3-W-S-01-B17012-008-000FotoS - Sanitaer
159ISB-020-U3-W-S-01-B17012-010-000FotoS - Sanitaer
160ISB-020-U3-W-S-01-B17012-011-000FotoS - Sanitaer
161ISB-020-U3-W-S-01-B18003-001-020Schieber / Hawle / Schieber 4000 + Handrad 780...S - Sanitaer
162ISB-020-U3-W-S-01-B19009-001-020Schieber / Hawle / 4000 SchutzraumS - Sanitaer
\n", "

158 rows × 3 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "text/plain": [ " Name \\\n", "0 ISB-020-U3-W-D-01-B07005-001-000 \n", "1 ISB-020-U3-W-D-01-B07005-002-000 \n", "2 ISB-020-U3-W-D-01-B07005-003-000 \n", "3 ISB-020-U3-W-D-01-B07005-004-000 \n", "4 ISB-020-U3-W-D-01-B18012-001-000 \n", ".. ... \n", "153 ISB-020-U3-W-S-01-B17012-008-000 \n", "159 ISB-020-U3-W-S-01-B17012-010-000 \n", "160 ISB-020-U3-W-S-01-B17012-011-000 \n", "161 ISB-020-U3-W-S-01-B18003-001-020 \n", "162 ISB-020-U3-W-S-01-B19009-001-020 \n", "\n", " Beschreibung Disziplin \n", "0 Bauarten und Stuecknachweis SGK D - Datennetz \n", "1 Bauarten und Stuecknachweis SGK D - Datennetz \n", "2 Pruefprotokoll nach DIN EN 61439-1/3 D - Datennetz \n", "3 Pruefprotokoll nach DIN EN 61439-1/3 D - Datennetz \n", "4 Sicherungslegende G-020 U3 779-AS 1 D - Datennetz \n", ".. ... ... \n", "153 Foto S - Sanitaer \n", "159 Foto S - Sanitaer \n", "160 Foto S - Sanitaer \n", "161 Schieber / Hawle / Schieber 4000 + Handrad 780... S - Sanitaer \n", "162 Schieber / Hawle / 4000 Schutzraum S - Sanitaer \n", "\n", "[158 rows x 3 columns]" ] }, "execution_count": 118, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Restrict the frame to the name, description and discipline columns,\n", "# then discard every row that has a missing value in any of them.\n", "features = ['Name', 'Beschreibung', 'Disziplin']\n", "clean_df = df.loc[:, features].dropna()\n", "clean_df" ] }, { "cell_type": "code", "execution_count": 143, "metadata": { "id": "_PtvbAskQa72" }, "outputs": [], "source": [ "# Persist the cleaned subset as CSV.\n", "# NOTE(review): to_csv's default index=True also writes the row index as an\n", "# unnamed first column -- confirm downstream readers expect it.\n", "clean_df.to_csv(path_or_buf='name-description-discipline-data.csv')" ] } ], "metadata": { "colab": { "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }