From 3ca3e6b16f35c98cd3931a261731b1c905a93610 Mon Sep 17 00:00:00 2001
From: devoalda
Date: Sun, 25 Feb 2024 11:28:34 +0800
Subject: [PATCH] Updated with Loading and Cleaning dataset

---
 Project.ipynb | 116 ++++++++++++++++++++++++--------------------------
 1 file changed, 56 insertions(+), 60 deletions(-)

diff --git a/Project.ipynb b/Project.ipynb
index 68be8cd..89d48a9 100644
--- a/Project.ipynb
+++ b/Project.ipynb
@@ -16,25 +16,26 @@
    "source": [
     "import os\n",
     "\n",
-    "# Importing the libraries\n",
-    "import pandas as pd\n",
-    "\n",
     "DATASET_DIR = './UWB-LOS-NLOS-Data-Set/dataset'"
    ],
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-02-25T02:29:46.399745088Z",
-     "start_time": "2024-02-25T02:29:46.386566147Z"
+     "end_time": "2024-02-25T03:26:58.464846949Z",
+     "start_time": "2024-02-25T03:26:58.415028614Z"
     }
    },
    "id": "bcd6cbaa5df10ce8",
-   "execution_count": 39
+   "execution_count": 73
  },
  {
    "cell_type": "markdown",
    "source": [
-    "# Load the data into a pandas dataframe"
+    "# Load the data into a pandas dataframe\n",
+    "\n",
+    "The first step in any data analysis project is to load the data into a suitable data structure. In this case, we will use the `pandas` library to load the data into a dataframe.\n",
+    "\n",
+    "We then clean the data by handling missing values, removing duplicates, converting data types, and performing outlier detection and removal. "
    ],
    "metadata": {
     "collapsed": false
@@ -48,69 +49,61 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      " NLOS RANGE FP_IDX FP_AMP1 FP_AMP2 FP_AMP3 STDEV_NOISE CIR_PWR \\\n",
-      "0 1.0 6.18 749.0 4889.0 13876.0 10464.0 240.0 9048.0 \n",
-      "1 1.0 4.54 741.0 2474.0 2002.0 1593.0 68.0 6514.0 \n",
-      "2 1.0 4.39 744.0 1934.0 2615.0 4114.0 52.0 2880.0 \n",
-      "3 1.0 1.27 748.0 16031.0 17712.0 10420.0 64.0 12855.0 \n",
-      "4 0.0 1.16 743.0 20070.0 19886.0 15727.0 76.0 11607.0 \n",
-      "\n",
-      " MAX_NOISE RXPACC ... CIR1006 CIR1007 CIR1008 CIR1009 CIR1010 \\\n",
-      "0 3668.0 1024.0 ... 818.0 938.0 588.0 277.0 727.0 \n",
-      "1 1031.0 1024.0 ... 289.0 228.0 107.0 487.0 491.0 \n",
-      "2 796.0 1024.0 ... 123.0 281.0 483.0 97.0 272.0 \n",
-      "3 1529.0 323.0 ... 169.0 138.0 219.0 94.0 225.0 \n",
-      "4 2022.0 296.0 ... 87.0 43.0 358.0 308.0 132.0 \n",
-      "\n",
-      " CIR1011 CIR1012 CIR1013 CIR1014 CIR1015 \n",
-      "0 367.0 803.0 819.0 467.0 768.0 \n",
-      "1 404.0 334.0 210.0 102.0 0.0 \n",
-      "2 73.0 125.0 169.0 182.0 0.0 \n",
-      "3 155.0 172.0 278.0 318.0 0.0 \n",
-      "4 131.0 102.0 126.0 163.0 0.0 \n",
-      "\n",
-      "[5 rows x 1031 columns]\n",
-      "Index(['NLOS', 'RANGE', 'FP_IDX', 'FP_AMP1', 'FP_AMP2', 'FP_AMP3',\n",
-      "       'STDEV_NOISE', 'CIR_PWR', 'MAX_NOISE', 'RXPACC',\n",
-      "       ...\n",
-      "       'CIR1006', 'CIR1007', 'CIR1008', 'CIR1009', 'CIR1010', 'CIR1011',\n",
-      "       'CIR1012', 'CIR1013', 'CIR1014', 'CIR1015'],\n",
-      "      dtype='object', length=1031)\n",
-      "No missing values\n"
+      "Original data shape: (42000, 1031)\n",
+      "Cleaned data shape: (42000, 1031)\n"
      ]
     }
    ],
    "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from scipy import stats\n",
+    "\n",
+    "\n",
     "def load_data(dataset_dir):\n",
-    "    # Get all file paths in the directory\n",
     "    file_paths = [os.path.join(dirpath, file) for dirpath, _, filenames in os.walk(dataset_dir) for file in filenames]\n",
-    "\n",
-    "    # Load and concatenate all dataframes\n",
-    "    data = pd.concat([pd.read_csv(file_path) for file_path in file_paths])\n",
-    "\n",
+    "    data = pd.concat((pd.read_csv(file_path) for file_path in file_paths))\n",
+    "    print(f\"Original data shape: {data.shape}\")\n",
     "    return data\n",
     "\n",
     "\n",
-    "data = load_data(DATASET_DIR)\n",
+    "def clean_data(data):\n",
+    "    # Handle missing values\n",
+    "    data = data.dropna()\n",
     "\n",
-    "print(data.head())\n",
+    "    # Remove duplicates\n",
+    "    data = data.drop_duplicates()\n",
+    "\n",
+    "    # Convert data types\n",
+    "    data['NLOS'] = data['NLOS'].astype(int)\n",
+    "\n",
+    "    # Outlier detection and removal\n",
+    "    z_scores = np.abs(stats.zscore(data.select_dtypes(include=[np.number])))\n",
+    "    data = data[(z_scores < 3).all(axis=1)]\n",
+    "\n",
+    "    print(f\"Cleaned data shape: {data.shape}\")\n",
+    "    return data\n",
+    "\n",
+    "\n",
+    "# Use the functions\n",
+    "data = load_data(DATASET_DIR)\n",
+    "data = clean_data(data)\n",
+    "\n",
+    "# print(data.head())\n",
     "\n",
     "# Print Headers\n",
-    "print(data.columns)\n",
-    "\n",
-    "# Check that there are no missing values\n",
-    "assert data.isnull().sum().sum() == 0\n",
-    "print(\"No missing values\")\n"
+    "# print(data.columns)"
    ],
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-02-25T02:29:51.084821398Z",
-     "start_time": "2024-02-25T02:29:46.405675293Z"
+     "end_time": "2024-02-25T03:27:06.334698247Z",
+     "start_time": "2024-02-25T03:26:58.458307532Z"
     }
    },
    "id": "dd9657f5ec6d7754",
-   "execution_count": 40
+   "execution_count": 74
  },
  {
    "cell_type": "markdown",
@@ -164,16 +157,19 @@
    ],
    "source": [
     "from sklearn.decomposition import PCA\n",
+    "\n",
     "from sklearn.preprocessing import StandardScaler\n",
     "\n",
     "# Standardize the data\n",
-    "data_std = StandardScaler().fit_transform(data)\n",
+    "numerical_cols = data.select_dtypes(include=[np.number]).columns\n",
+    "scaler = StandardScaler()\n",
+    "data[numerical_cols] = scaler.fit_transform(data[numerical_cols])\n",
     "\n",
     "# Initialize PCA with the desired explained variance\n",
     "pca = PCA(0.95)\n",
     "\n",
     "# Fit PCA to your data\n",
-    "pca.fit(data_std)\n",
+    "pca.fit(data)\n",
     "\n",
     "# Get the number of components\n",
     "num_components = pca.n_components_\n",
     "\n",
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-02-25T02:29:58.267018142Z",
-     "start_time": "2024-02-25T02:29:51.084440279Z"
+     "end_time": "2024-02-25T03:27:13.639843012Z",
"start_time": "2024-02-25T03:27:06.336830842Z" } }, "id": "7f9bec73a42f7bca", - "execution_count": 41 + "execution_count": 75 }, { "cell_type": "markdown", @@ -217,7 +213,7 @@ ], "source": [ "# Project original data to PC with the highest eigenvalue\n", - "data_pca = pca.transform(data_std)\n", + "data_pca = pca.transform(data)\n", "\n", "# Create a dataframe with the principal components\n", "data_pca_df = pd.DataFrame(data_pca, columns=[f\"PC{i}\" for i in range(1, num_components + 1)])\n", @@ -237,12 +233,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-02-25T02:29:59.029369440Z", - "start_time": "2024-02-25T02:29:58.266576678Z" + "end_time": "2024-02-25T03:27:14.422886263Z", + "start_time": "2024-02-25T03:27:13.660170622Z" } }, "id": "96c62c50f8734a01", - "execution_count": 42 + "execution_count": 76 } ], "metadata": {