Updated with dataset loading and cleaning

This commit is contained in:
devoalda 2024-02-25 11:28:34 +08:00
parent cd1abfbd9f
commit 3ca3e6b16f
1 changed file with 56 additions and 60 deletions


@ -16,25 +16,26 @@
"source": [ "source": [
"import os\n", "import os\n",
"\n", "\n",
"# Importing the libraries\n",
"import pandas as pd\n",
"\n",
"DATASET_DIR = './UWB-LOS-NLOS-Data-Set/dataset'" "DATASET_DIR = './UWB-LOS-NLOS-Data-Set/dataset'"
], ],
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"ExecuteTime": { "ExecuteTime": {
"end_time": "2024-02-25T02:29:46.399745088Z", "end_time": "2024-02-25T03:26:58.464846949Z",
"start_time": "2024-02-25T02:29:46.386566147Z" "start_time": "2024-02-25T03:26:58.415028614Z"
} }
}, },
"id": "bcd6cbaa5df10ce8", "id": "bcd6cbaa5df10ce8",
"execution_count": 39 "execution_count": 73
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"source": [ "source": [
"# Load the data into a pandas dataframe" "# Load the data into a pandas dataframe\n",
"\n",
"The first step in any data analysis project is to load the data into a suitable data structure. In this case, we will use the `pandas` library to load the data into a dataframe.\n",
"\n",
"We then clean the data by handling missing values, removing duplicates, converting data types, and performing outlier detection and removal. "
], ],
"metadata": { "metadata": {
"collapsed": false "collapsed": false
@ -48,69 +49,61 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
" NLOS RANGE FP_IDX FP_AMP1 FP_AMP2 FP_AMP3 STDEV_NOISE CIR_PWR \\\n", "Original data shape: (42000, 1031)\n",
"0 1.0 6.18 749.0 4889.0 13876.0 10464.0 240.0 9048.0 \n", "Cleaned data shape: (42000, 1031)\n"
"1 1.0 4.54 741.0 2474.0 2002.0 1593.0 68.0 6514.0 \n",
"2 1.0 4.39 744.0 1934.0 2615.0 4114.0 52.0 2880.0 \n",
"3 1.0 1.27 748.0 16031.0 17712.0 10420.0 64.0 12855.0 \n",
"4 0.0 1.16 743.0 20070.0 19886.0 15727.0 76.0 11607.0 \n",
"\n",
" MAX_NOISE RXPACC ... CIR1006 CIR1007 CIR1008 CIR1009 CIR1010 \\\n",
"0 3668.0 1024.0 ... 818.0 938.0 588.0 277.0 727.0 \n",
"1 1031.0 1024.0 ... 289.0 228.0 107.0 487.0 491.0 \n",
"2 796.0 1024.0 ... 123.0 281.0 483.0 97.0 272.0 \n",
"3 1529.0 323.0 ... 169.0 138.0 219.0 94.0 225.0 \n",
"4 2022.0 296.0 ... 87.0 43.0 358.0 308.0 132.0 \n",
"\n",
" CIR1011 CIR1012 CIR1013 CIR1014 CIR1015 \n",
"0 367.0 803.0 819.0 467.0 768.0 \n",
"1 404.0 334.0 210.0 102.0 0.0 \n",
"2 73.0 125.0 169.0 182.0 0.0 \n",
"3 155.0 172.0 278.0 318.0 0.0 \n",
"4 131.0 102.0 126.0 163.0 0.0 \n",
"\n",
"[5 rows x 1031 columns]\n",
"Index(['NLOS', 'RANGE', 'FP_IDX', 'FP_AMP1', 'FP_AMP2', 'FP_AMP3',\n",
" 'STDEV_NOISE', 'CIR_PWR', 'MAX_NOISE', 'RXPACC',\n",
" ...\n",
" 'CIR1006', 'CIR1007', 'CIR1008', 'CIR1009', 'CIR1010', 'CIR1011',\n",
" 'CIR1012', 'CIR1013', 'CIR1014', 'CIR1015'],\n",
" dtype='object', length=1031)\n",
"No missing values\n"
] ]
} }
], ],
"source": [ "source": [
"import pandas as pd\n",
"import numpy as np\n",
"from scipy import stats\n",
"\n",
"\n",
"def load_data(dataset_dir):\n", "def load_data(dataset_dir):\n",
" # Get all file paths in the directory\n", " # Load the data\n",
" file_paths = [os.path.join(dirpath, file) for dirpath, _, filenames in os.walk(dataset_dir) for file in filenames]\n", " file_paths = [os.path.join(dirpath, file) for dirpath, _, filenames in os.walk(dataset_dir) for file in filenames]\n",
"\n", " data = pd.concat((pd.read_csv(file_path) for file_path in file_paths))\n",
" # Load and concatenate all dataframes\n", " print(f\"Original data shape: {data.shape}\")\n",
" data = pd.concat([pd.read_csv(file_path) for file_path in file_paths])\n",
"\n",
" return data\n", " return data\n",
"\n", "\n",
"\n", "\n",
"data = load_data(DATASET_DIR)\n", "def clean_data(data):\n",
" # Handle missing values\n",
" data = data.dropna()\n",
"\n", "\n",
"print(data.head())\n", " # Remove duplicates\n",
" data = data.drop_duplicates()\n",
"\n",
" # Convert data types\n",
" data['NLOS'] = data['NLOS'].astype(int)\n",
"\n",
" # Outlier detection and removal\n",
" z_scores = np.abs(stats.zscore(data.select_dtypes(include=[np.number])))\n",
" data = data[(z_scores < 3).any(axis=1)]\n",
"\n",
" print(f\"Cleaned data shape: {data.shape}\")\n",
" return data\n",
"\n",
"\n",
"# Use the functions\n",
"data = load_data(DATASET_DIR)\n",
"data = clean_data(data)\n",
"\n",
"# print(data.head())\n",
"\n", "\n",
"# Print Headers\n", "# Print Headers\n",
"print(data.columns)\n", "# print(data.columns)"
"\n",
"# Check that there are no missing values\n",
"assert data.isnull().sum().sum() == 0\n",
"print(\"No missing values\")\n"
], ],
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"ExecuteTime": { "ExecuteTime": {
"end_time": "2024-02-25T02:29:51.084821398Z", "end_time": "2024-02-25T03:27:06.334698247Z",
"start_time": "2024-02-25T02:29:46.405675293Z" "start_time": "2024-02-25T03:26:58.458307532Z"
} }
}, },
"id": "dd9657f5ec6d7754", "id": "dd9657f5ec6d7754",
"execution_count": 40 "execution_count": 74
}, },
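Note on the z-score step: the filter in `clean_data` keeps a row as long as any one of its numeric features lies within three standard deviations of its column mean, so it removes almost nothing, which matches the printed shapes (42,000 rows before and after). A stricter per-row variant is sketched below purely as an illustration, not as the committed behaviour; `remove_outliers_strict` and its `threshold` parameter are hypothetical names.

```python
import numpy as np
import pandas as pd
from scipy import stats


def remove_outliers_strict(data: pd.DataFrame, threshold: float = 3.0) -> pd.DataFrame:
    """Drop rows where any numeric feature has an absolute z-score >= threshold."""
    numeric = data.select_dtypes(include=[np.number])
    z_scores = np.abs(stats.zscore(numeric))
    # .all(axis=1) keeps a row only if every feature is within the threshold;
    # with ~1000 columns this is far more aggressive than the .any(axis=1) filter above
    mask = (z_scores < threshold).all(axis=1)
    return data[mask]
```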
{ {
"cell_type": "markdown", "cell_type": "markdown",
@ -164,16 +157,19 @@
], ],
"source": [ "source": [
"from sklearn.decomposition import PCA\n", "from sklearn.decomposition import PCA\n",
"\n",
"from sklearn.preprocessing import StandardScaler\n", "from sklearn.preprocessing import StandardScaler\n",
"\n", "\n",
"# Standardize the data\n", "# Standardize the data\n",
"data_std = StandardScaler().fit_transform(data)\n", "numerical_cols = data.select_dtypes(include=[np.number]).columns\n",
"scaler = StandardScaler()\n",
"data[numerical_cols] = scaler.fit_transform(data[numerical_cols])\n",
"\n", "\n",
"# Initialize PCA with the desired explained variance\n", "# Initialize PCA with the desired explained variance\n",
"pca = PCA(0.95)\n", "pca = PCA(0.95)\n",
"\n", "\n",
"# Fit PCA to your data\n", "# Fit PCA to your data\n",
"pca.fit(data_std)\n", "pca.fit(data)\n",
"\n", "\n",
"# Get the number of components\n", "# Get the number of components\n",
"num_components = pca.n_components_\n", "num_components = pca.n_components_\n",
@ -183,12 +179,12 @@
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"ExecuteTime": { "ExecuteTime": {
"end_time": "2024-02-25T02:29:58.267018142Z", "end_time": "2024-02-25T03:27:13.639843012Z",
"start_time": "2024-02-25T02:29:51.084440279Z" "start_time": "2024-02-25T03:27:06.336830842Z"
} }
}, },
"id": "7f9bec73a42f7bca", "id": "7f9bec73a42f7bca",
"execution_count": 41 "execution_count": 75
}, },
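Passing `0.95` to `PCA` makes scikit-learn keep the smallest number of components whose cumulative explained variance reaches 95%. A minimal sketch of double-checking that choice against the cumulative explained-variance ratio, assuming the fitted `pca` object from the cell above:

```python
import numpy as np

# Cumulative share of variance explained by the first k components
cumulative = np.cumsum(pca.explained_variance_ratio_)

# Smallest k whose cumulative ratio reaches 0.95; normally equals pca.n_components_
k = int(np.searchsorted(cumulative, 0.95)) + 1
print(f"Retained {pca.n_components_} components explaining "
      f"{cumulative[-1]:.3f} of the variance (expected k = {k})")
```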
{ {
"cell_type": "markdown", "cell_type": "markdown",
@ -217,7 +213,7 @@
], ],
"source": [ "source": [
"# Project original data to PC with the highest eigenvalue\n", "# Project original data to PC with the highest eigenvalue\n",
"data_pca = pca.transform(data_std)\n", "data_pca = pca.transform(data)\n",
"\n", "\n",
"# Create a dataframe with the principal components\n", "# Create a dataframe with the principal components\n",
"data_pca_df = pd.DataFrame(data_pca, columns=[f\"PC{i}\" for i in range(1, num_components + 1)])\n", "data_pca_df = pd.DataFrame(data_pca, columns=[f\"PC{i}\" for i in range(1, num_components + 1)])\n",
@ -237,12 +233,12 @@
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"ExecuteTime": { "ExecuteTime": {
"end_time": "2024-02-25T02:29:59.029369440Z", "end_time": "2024-02-25T03:27:14.422886263Z",
"start_time": "2024-02-25T02:29:58.266576678Z" "start_time": "2024-02-25T03:27:13.660170622Z"
} }
}, },
"id": "96c62c50f8734a01", "id": "96c62c50f8734a01",
"execution_count": 42 "execution_count": 76
} }
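Each principal component is a weighted combination of the original 1,031 columns, so the `PC{i}` names in `data_pca_df` can be traced back to the features that dominate them via `pca.components_`. A small sketch, assuming `pca` and the cleaned `data` from the cells above:

```python
import pandas as pd

# Loadings matrix: one row per retained component, one column per original feature
loadings = pd.DataFrame(
    pca.components_,
    columns=data.columns,
    index=[f"PC{i}" for i in range(1, pca.n_components_ + 1)],
)

# Ten original features with the largest absolute weight on the first component
print(loadings.loc["PC1"].abs().nlargest(10))
```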
], ],
"metadata": { "metadata": {