Updated with Loading and Cleaning dataset

This commit is contained in:
devoalda 2024-02-25 11:28:34 +08:00
parent cd1abfbd9f
commit 3ca3e6b16f
1 changed file with 56 additions and 60 deletions


@@ -16,25 +16,26 @@
"source": [
"import os\n",
"\n",
"# Importing the libraries\n",
"import pandas as pd\n",
"\n",
"DATASET_DIR = './UWB-LOS-NLOS-Data-Set/dataset'"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-02-25T02:29:46.399745088Z",
"start_time": "2024-02-25T02:29:46.386566147Z"
"end_time": "2024-02-25T03:26:58.464846949Z",
"start_time": "2024-02-25T03:26:58.415028614Z"
}
},
"id": "bcd6cbaa5df10ce8",
"execution_count": 39
"execution_count": 73
},
{
"cell_type": "markdown",
"source": [
"# Load the data into a pandas dataframe"
"# Load the data into a pandas dataframe\n",
"\n",
"The first step in any data analysis project is to load the data into a suitable data structure. In this case, we will use the `pandas` library to load the data into a dataframe.\n",
"\n",
"We then clean the data by handling missing values, removing duplicates, converting data types, and performing outlier detection and removal. "
],
"metadata": {
"collapsed": false
@@ -48,69 +49,61 @@
"name": "stdout",
"output_type": "stream",
"text": [
" NLOS RANGE FP_IDX FP_AMP1 FP_AMP2 FP_AMP3 STDEV_NOISE CIR_PWR \\\n",
"0 1.0 6.18 749.0 4889.0 13876.0 10464.0 240.0 9048.0 \n",
"1 1.0 4.54 741.0 2474.0 2002.0 1593.0 68.0 6514.0 \n",
"2 1.0 4.39 744.0 1934.0 2615.0 4114.0 52.0 2880.0 \n",
"3 1.0 1.27 748.0 16031.0 17712.0 10420.0 64.0 12855.0 \n",
"4 0.0 1.16 743.0 20070.0 19886.0 15727.0 76.0 11607.0 \n",
"\n",
" MAX_NOISE RXPACC ... CIR1006 CIR1007 CIR1008 CIR1009 CIR1010 \\\n",
"0 3668.0 1024.0 ... 818.0 938.0 588.0 277.0 727.0 \n",
"1 1031.0 1024.0 ... 289.0 228.0 107.0 487.0 491.0 \n",
"2 796.0 1024.0 ... 123.0 281.0 483.0 97.0 272.0 \n",
"3 1529.0 323.0 ... 169.0 138.0 219.0 94.0 225.0 \n",
"4 2022.0 296.0 ... 87.0 43.0 358.0 308.0 132.0 \n",
"\n",
" CIR1011 CIR1012 CIR1013 CIR1014 CIR1015 \n",
"0 367.0 803.0 819.0 467.0 768.0 \n",
"1 404.0 334.0 210.0 102.0 0.0 \n",
"2 73.0 125.0 169.0 182.0 0.0 \n",
"3 155.0 172.0 278.0 318.0 0.0 \n",
"4 131.0 102.0 126.0 163.0 0.0 \n",
"\n",
"[5 rows x 1031 columns]\n",
"Index(['NLOS', 'RANGE', 'FP_IDX', 'FP_AMP1', 'FP_AMP2', 'FP_AMP3',\n",
" 'STDEV_NOISE', 'CIR_PWR', 'MAX_NOISE', 'RXPACC',\n",
" ...\n",
" 'CIR1006', 'CIR1007', 'CIR1008', 'CIR1009', 'CIR1010', 'CIR1011',\n",
" 'CIR1012', 'CIR1013', 'CIR1014', 'CIR1015'],\n",
" dtype='object', length=1031)\n",
"No missing values\n"
"Original data shape: (42000, 1031)\n",
"Cleaned data shape: (42000, 1031)\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from scipy import stats\n",
"\n",
"\n",
"def load_data(dataset_dir):\n",
" # Get all file paths in the directory\n",
" # Load the data\n",
" file_paths = [os.path.join(dirpath, file) for dirpath, _, filenames in os.walk(dataset_dir) for file in filenames]\n",
"\n",
" # Load and concatenate all dataframes\n",
" data = pd.concat([pd.read_csv(file_path) for file_path in file_paths])\n",
"\n",
" data = pd.concat((pd.read_csv(file_path) for file_path in file_paths))\n",
" print(f\"Original data shape: {data.shape}\")\n",
" return data\n",
"\n",
"\n",
"data = load_data(DATASET_DIR)\n",
"def clean_data(data):\n",
" # Handle missing values\n",
" data = data.dropna()\n",
"\n",
"print(data.head())\n",
" # Remove duplicates\n",
" data = data.drop_duplicates()\n",
"\n",
" # Convert data types\n",
" data['NLOS'] = data['NLOS'].astype(int)\n",
"\n",
" # Outlier detection and removal\n",
" z_scores = np.abs(stats.zscore(data.select_dtypes(include=[np.number])))\n",
" data = data[(z_scores < 3).any(axis=1)]\n",
"\n",
" print(f\"Cleaned data shape: {data.shape}\")\n",
" return data\n",
"\n",
"\n",
"# Use the functions\n",
"data = load_data(DATASET_DIR)\n",
"data = clean_data(data)\n",
"\n",
"# print(data.head())\n",
"\n",
"# Print Headers\n",
"print(data.columns)\n",
"\n",
"# Check that there are no missing values\n",
"assert data.isnull().sum().sum() == 0\n",
"print(\"No missing values\")\n"
"# print(data.columns)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-02-25T02:29:51.084821398Z",
"start_time": "2024-02-25T02:29:46.405675293Z"
"end_time": "2024-02-25T03:27:06.334698247Z",
"start_time": "2024-02-25T03:26:58.458307532Z"
}
},
"id": "dd9657f5ec6d7754",
"execution_count": 40
"execution_count": 74
},
{
"cell_type": "markdown",
@@ -164,16 +157,19 @@
],
"source": [
"from sklearn.decomposition import PCA\n",
"\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"# Standardize the data\n",
"data_std = StandardScaler().fit_transform(data)\n",
"numerical_cols = data.select_dtypes(include=[np.number]).columns\n",
"scaler = StandardScaler()\n",
"data[numerical_cols] = scaler.fit_transform(data[numerical_cols])\n",
"\n",
"# Initialize PCA with the desired explained variance\n",
"pca = PCA(0.95)\n",
"\n",
"# Fit PCA to your data\n",
"pca.fit(data_std)\n",
"pca.fit(data)\n",
"\n",
"# Get the number of components\n",
"num_components = pca.n_components_\n",
@@ -183,12 +179,12 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-02-25T02:29:58.267018142Z",
"start_time": "2024-02-25T02:29:51.084440279Z"
"end_time": "2024-02-25T03:27:13.639843012Z",
"start_time": "2024-02-25T03:27:06.336830842Z"
}
},
"id": "7f9bec73a42f7bca",
"execution_count": 41
"execution_count": 75
},
{
"cell_type": "markdown",
@@ -217,7 +213,7 @@
],
"source": [
"# Project original data to PC with the highest eigenvalue\n",
"data_pca = pca.transform(data_std)\n",
"data_pca = pca.transform(data)\n",
"\n",
"# Create a dataframe with the principal components\n",
"data_pca_df = pd.DataFrame(data_pca, columns=[f\"PC{i}\" for i in range(1, num_components + 1)])\n",
@ -237,12 +233,12 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-02-25T02:29:59.029369440Z",
"start_time": "2024-02-25T02:29:58.266576678Z"
"end_time": "2024-02-25T03:27:14.422886263Z",
"start_time": "2024-02-25T03:27:13.660170622Z"
}
},
"id": "96c62c50f8734a01",
"execution_count": 42
"execution_count": 76
}
],
"metadata": {