Updated with dataset loading and cleaning

This commit is contained in:
devoalda 2024-02-25 11:28:34 +08:00
parent cd1abfbd9f
commit 3ca3e6b16f
1 changed file with 56 additions and 60 deletions


@ -16,25 +16,26 @@
"source": [ "source": [
"import os\n", "import os\n",
"\n", "\n",
"# Importing the libraries\n",
"import pandas as pd\n",
"\n",
"DATASET_DIR = './UWB-LOS-NLOS-Data-Set/dataset'" "DATASET_DIR = './UWB-LOS-NLOS-Data-Set/dataset'"
], ],
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"ExecuteTime": { "ExecuteTime": {
"end_time": "2024-02-25T02:29:46.399745088Z", "end_time": "2024-02-25T03:26:58.464846949Z",
"start_time": "2024-02-25T02:29:46.386566147Z" "start_time": "2024-02-25T03:26:58.415028614Z"
} }
}, },
"id": "bcd6cbaa5df10ce8", "id": "bcd6cbaa5df10ce8",
"execution_count": 39 "execution_count": 73
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"source": [ "source": [
"# Load the data into a pandas dataframe" "# Load the data into a pandas dataframe\n",
"\n",
"The first step in any data analysis project is to load the data into a suitable data structure. In this case, we will use the `pandas` library to load the data into a dataframe.\n",
"\n",
"We then clean the data by handling missing values, removing duplicates, converting data types, and performing outlier detection and removal. "
], ],
"metadata": { "metadata": {
"collapsed": false "collapsed": false
@ -48,69 +49,61 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
" NLOS RANGE FP_IDX FP_AMP1 FP_AMP2 FP_AMP3 STDEV_NOISE CIR_PWR \\\n", "Original data shape: (42000, 1031)\n",
"0 1.0 6.18 749.0 4889.0 13876.0 10464.0 240.0 9048.0 \n", "Cleaned data shape: (42000, 1031)\n"
"1 1.0 4.54 741.0 2474.0 2002.0 1593.0 68.0 6514.0 \n",
"2 1.0 4.39 744.0 1934.0 2615.0 4114.0 52.0 2880.0 \n",
"3 1.0 1.27 748.0 16031.0 17712.0 10420.0 64.0 12855.0 \n",
"4 0.0 1.16 743.0 20070.0 19886.0 15727.0 76.0 11607.0 \n",
"\n",
" MAX_NOISE RXPACC ... CIR1006 CIR1007 CIR1008 CIR1009 CIR1010 \\\n",
"0 3668.0 1024.0 ... 818.0 938.0 588.0 277.0 727.0 \n",
"1 1031.0 1024.0 ... 289.0 228.0 107.0 487.0 491.0 \n",
"2 796.0 1024.0 ... 123.0 281.0 483.0 97.0 272.0 \n",
"3 1529.0 323.0 ... 169.0 138.0 219.0 94.0 225.0 \n",
"4 2022.0 296.0 ... 87.0 43.0 358.0 308.0 132.0 \n",
"\n",
" CIR1011 CIR1012 CIR1013 CIR1014 CIR1015 \n",
"0 367.0 803.0 819.0 467.0 768.0 \n",
"1 404.0 334.0 210.0 102.0 0.0 \n",
"2 73.0 125.0 169.0 182.0 0.0 \n",
"3 155.0 172.0 278.0 318.0 0.0 \n",
"4 131.0 102.0 126.0 163.0 0.0 \n",
"\n",
"[5 rows x 1031 columns]\n",
"Index(['NLOS', 'RANGE', 'FP_IDX', 'FP_AMP1', 'FP_AMP2', 'FP_AMP3',\n",
" 'STDEV_NOISE', 'CIR_PWR', 'MAX_NOISE', 'RXPACC',\n",
" ...\n",
" 'CIR1006', 'CIR1007', 'CIR1008', 'CIR1009', 'CIR1010', 'CIR1011',\n",
" 'CIR1012', 'CIR1013', 'CIR1014', 'CIR1015'],\n",
" dtype='object', length=1031)\n",
"No missing values\n"
] ]
} }
], ],
"source": [ "source": [
"import pandas as pd\n",
"import numpy as np\n",
"from scipy import stats\n",
"\n",
"\n",
"def load_data(dataset_dir):\n", "def load_data(dataset_dir):\n",
" # Get all file paths in the directory\n", " # Load the data\n",
" file_paths = [os.path.join(dirpath, file) for dirpath, _, filenames in os.walk(dataset_dir) for file in filenames]\n", " file_paths = [os.path.join(dirpath, file) for dirpath, _, filenames in os.walk(dataset_dir) for file in filenames]\n",
"\n", " data = pd.concat((pd.read_csv(file_path) for file_path in file_paths))\n",
" # Load and concatenate all dataframes\n", " print(f\"Original data shape: {data.shape}\")\n",
" data = pd.concat([pd.read_csv(file_path) for file_path in file_paths])\n",
"\n",
" return data\n", " return data\n",
"\n", "\n",
"\n", "\n",
"data = load_data(DATASET_DIR)\n", "def clean_data(data):\n",
" # Handle missing values\n",
" data = data.dropna()\n",
"\n", "\n",
"print(data.head())\n", " # Remove duplicates\n",
" data = data.drop_duplicates()\n",
"\n",
" # Convert data types\n",
" data['NLOS'] = data['NLOS'].astype(int)\n",
"\n",
" # Outlier detection and removal\n",
" z_scores = np.abs(stats.zscore(data.select_dtypes(include=[np.number])))\n",
" data = data[(z_scores < 3).any(axis=1)]\n",
"\n",
" print(f\"Cleaned data shape: {data.shape}\")\n",
" return data\n",
"\n",
"\n",
"# Use the functions\n",
"data = load_data(DATASET_DIR)\n",
"data = clean_data(data)\n",
"\n",
"# print(data.head())\n",
"\n", "\n",
"# Print Headers\n", "# Print Headers\n",
"print(data.columns)\n", "# print(data.columns)"
"\n",
"# Check that there are no missing values\n",
"assert data.isnull().sum().sum() == 0\n",
"print(\"No missing values\")\n"
], ],
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"ExecuteTime": { "ExecuteTime": {
"end_time": "2024-02-25T02:29:51.084821398Z", "end_time": "2024-02-25T03:27:06.334698247Z",
"start_time": "2024-02-25T02:29:46.405675293Z" "start_time": "2024-02-25T03:26:58.458307532Z"
} }
}, },
"id": "dd9657f5ec6d7754", "id": "dd9657f5ec6d7754",
"execution_count": 40 "execution_count": 74
}, },
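Note on the z-score step: the filter in `clean_data` keeps a row as long as any one of its numeric features lies within three standard deviations of its column mean, so it removes almost nothing, which matches the printed shapes (42,000 rows before and after). A stricter per-row variant is sketched below purely as an illustration, not as the committed behaviour; `remove_outliers_strict` and its `threshold` parameter are hypothetical names.

```python
import numpy as np
import pandas as pd
from scipy import stats


def remove_outliers_strict(data: pd.DataFrame, threshold: float = 3.0) -> pd.DataFrame:
    """Drop rows where any numeric feature has an absolute z-score >= threshold."""
    numeric = data.select_dtypes(include=[np.number])
    z_scores = np.abs(stats.zscore(numeric))
    # .all(axis=1) keeps a row only if every feature is within the threshold;
    # with ~1000 columns this is far more aggressive than the .any(axis=1) filter above
    mask = (z_scores < threshold).all(axis=1)
    return data[mask]
```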
{ {
"cell_type": "markdown", "cell_type": "markdown",
@ -164,16 +157,19 @@
], ],
"source": [ "source": [
"from sklearn.decomposition import PCA\n", "from sklearn.decomposition import PCA\n",
"\n",
"from sklearn.preprocessing import StandardScaler\n", "from sklearn.preprocessing import StandardScaler\n",
"\n", "\n",
"# Standardize the data\n", "# Standardize the data\n",
"data_std = StandardScaler().fit_transform(data)\n", "numerical_cols = data.select_dtypes(include=[np.number]).columns\n",
"scaler = StandardScaler()\n",
"data[numerical_cols] = scaler.fit_transform(data[numerical_cols])\n",
"\n", "\n",
"# Initialize PCA with the desired explained variance\n", "# Initialize PCA with the desired explained variance\n",
"pca = PCA(0.95)\n", "pca = PCA(0.95)\n",
"\n", "\n",
"# Fit PCA to your data\n", "# Fit PCA to your data\n",
"pca.fit(data_std)\n", "pca.fit(data)\n",
"\n", "\n",
"# Get the number of components\n", "# Get the number of components\n",
"num_components = pca.n_components_\n", "num_components = pca.n_components_\n",
@ -183,12 +179,12 @@
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"ExecuteTime": { "ExecuteTime": {
"end_time": "2024-02-25T02:29:58.267018142Z", "end_time": "2024-02-25T03:27:13.639843012Z",
"start_time": "2024-02-25T02:29:51.084440279Z" "start_time": "2024-02-25T03:27:06.336830842Z"
} }
}, },
"id": "7f9bec73a42f7bca", "id": "7f9bec73a42f7bca",
"execution_count": 41 "execution_count": 75
}, },
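Passing `0.95` to `PCA` makes scikit-learn keep the smallest number of components whose cumulative explained variance reaches 95%. A minimal sketch of double-checking that choice against the cumulative explained-variance ratio, assuming the fitted `pca` object from the cell above:

```python
import numpy as np

# Cumulative share of variance explained by the first k components
cumulative = np.cumsum(pca.explained_variance_ratio_)

# Smallest k whose cumulative ratio reaches 0.95; normally equals pca.n_components_
k = int(np.searchsorted(cumulative, 0.95)) + 1
print(f"Retained {pca.n_components_} components explaining "
      f"{cumulative[-1]:.3f} of the variance (expected k = {k})")
```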
{ {
"cell_type": "markdown", "cell_type": "markdown",
@ -217,7 +213,7 @@
], ],
"source": [ "source": [
"# Project original data to PC with the highest eigenvalue\n", "# Project original data to PC with the highest eigenvalue\n",
"data_pca = pca.transform(data_std)\n", "data_pca = pca.transform(data)\n",
"\n", "\n",
"# Create a dataframe with the principal components\n", "# Create a dataframe with the principal components\n",
"data_pca_df = pd.DataFrame(data_pca, columns=[f\"PC{i}\" for i in range(1, num_components + 1)])\n", "data_pca_df = pd.DataFrame(data_pca, columns=[f\"PC{i}\" for i in range(1, num_components + 1)])\n",
@ -237,12 +233,12 @@
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"ExecuteTime": { "ExecuteTime": {
"end_time": "2024-02-25T02:29:59.029369440Z", "end_time": "2024-02-25T03:27:14.422886263Z",
"start_time": "2024-02-25T02:29:58.266576678Z" "start_time": "2024-02-25T03:27:13.660170622Z"
} }
}, },
"id": "96c62c50f8734a01", "id": "96c62c50f8734a01",
"execution_count": 42 "execution_count": 76
} }
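Each principal component is a weighted combination of the original 1,031 columns, so the `PC{i}` names in `data_pca_df` can be traced back to the features that dominate them via `pca.components_`. A small sketch, assuming `pca` and the cleaned `data` from the cells above:

```python
import pandas as pd

# Loadings matrix: one row per retained component, one column per original feature
loadings = pd.DataFrame(
    pca.components_,
    columns=data.columns,
    index=[f"PC{i}" for i in range(1, pca.n_components_ + 1)],
)

# Ten original features with the largest absolute weight on the first component
print(loadings.loc["PC1"].abs().nlargest(10))
```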
], ],
"metadata": { "metadata": {