Updated with Loading and Cleaning dataset

This commit is contained in:
devoalda 2024-02-25 11:28:34 +08:00
parent cd1abfbd9f
commit 3ca3e6b16f
1 changed file with 56 additions and 60 deletions


@@ -16,25 +16,26 @@
"source": [
"import os\n",
"\n",
"# Importing the libraries\n",
"import pandas as pd\n",
"\n",
"DATASET_DIR = './UWB-LOS-NLOS-Data-Set/dataset'"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-02-25T02:29:46.399745088Z",
"start_time": "2024-02-25T02:29:46.386566147Z"
"end_time": "2024-02-25T03:26:58.464846949Z",
"start_time": "2024-02-25T03:26:58.415028614Z"
}
},
"id": "bcd6cbaa5df10ce8",
"execution_count": 39
"execution_count": 73
},
{
"cell_type": "markdown",
"source": [
"# Load the data into a pandas dataframe"
"# Load the data into a pandas dataframe\n",
"\n",
"The first step in any data analysis project is to load the data into a suitable data structure. In this case, we will use the `pandas` library to load the data into a dataframe.\n",
"\n",
"We then clean the data by handling missing values, removing duplicates, converting data types, and performing outlier detection and removal. "
],
"metadata": {
"collapsed": false
@@ -48,69 +49,61 @@
"name": "stdout",
"output_type": "stream",
"text": [
" NLOS RANGE FP_IDX FP_AMP1 FP_AMP2 FP_AMP3 STDEV_NOISE CIR_PWR \\\n",
"0 1.0 6.18 749.0 4889.0 13876.0 10464.0 240.0 9048.0 \n",
"1 1.0 4.54 741.0 2474.0 2002.0 1593.0 68.0 6514.0 \n",
"2 1.0 4.39 744.0 1934.0 2615.0 4114.0 52.0 2880.0 \n",
"3 1.0 1.27 748.0 16031.0 17712.0 10420.0 64.0 12855.0 \n",
"4 0.0 1.16 743.0 20070.0 19886.0 15727.0 76.0 11607.0 \n",
"\n",
" MAX_NOISE RXPACC ... CIR1006 CIR1007 CIR1008 CIR1009 CIR1010 \\\n",
"0 3668.0 1024.0 ... 818.0 938.0 588.0 277.0 727.0 \n",
"1 1031.0 1024.0 ... 289.0 228.0 107.0 487.0 491.0 \n",
"2 796.0 1024.0 ... 123.0 281.0 483.0 97.0 272.0 \n",
"3 1529.0 323.0 ... 169.0 138.0 219.0 94.0 225.0 \n",
"4 2022.0 296.0 ... 87.0 43.0 358.0 308.0 132.0 \n",
"\n",
" CIR1011 CIR1012 CIR1013 CIR1014 CIR1015 \n",
"0 367.0 803.0 819.0 467.0 768.0 \n",
"1 404.0 334.0 210.0 102.0 0.0 \n",
"2 73.0 125.0 169.0 182.0 0.0 \n",
"3 155.0 172.0 278.0 318.0 0.0 \n",
"4 131.0 102.0 126.0 163.0 0.0 \n",
"\n",
"[5 rows x 1031 columns]\n",
"Index(['NLOS', 'RANGE', 'FP_IDX', 'FP_AMP1', 'FP_AMP2', 'FP_AMP3',\n",
" 'STDEV_NOISE', 'CIR_PWR', 'MAX_NOISE', 'RXPACC',\n",
" ...\n",
" 'CIR1006', 'CIR1007', 'CIR1008', 'CIR1009', 'CIR1010', 'CIR1011',\n",
" 'CIR1012', 'CIR1013', 'CIR1014', 'CIR1015'],\n",
" dtype='object', length=1031)\n",
"No missing values\n"
"Original data shape: (42000, 1031)\n",
"Cleaned data shape: (42000, 1031)\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from scipy import stats\n",
"\n",
"\n",
"def load_data(dataset_dir):\n",
" # Get all file paths in the directory\n",
" # Load the data\n",
" file_paths = [os.path.join(dirpath, file) for dirpath, _, filenames in os.walk(dataset_dir) for file in filenames]\n",
"\n",
" # Load and concatenate all dataframes\n",
" data = pd.concat([pd.read_csv(file_path) for file_path in file_paths])\n",
"\n",
" data = pd.concat((pd.read_csv(file_path) for file_path in file_paths))\n",
" print(f\"Original data shape: {data.shape}\")\n",
" return data\n",
"\n",
"\n",
"data = load_data(DATASET_DIR)\n",
"def clean_data(data):\n",
" # Handle missing values\n",
" data = data.dropna()\n",
"\n",
"print(data.head())\n",
" # Remove duplicates\n",
" data = data.drop_duplicates()\n",
"\n",
" # Convert data types\n",
" data['NLOS'] = data['NLOS'].astype(int)\n",
"\n",
" # Outlier detection and removal\n",
" z_scores = np.abs(stats.zscore(data.select_dtypes(include=[np.number])))\n",
" data = data[(z_scores < 3).any(axis=1)]\n",
"\n",
" print(f\"Cleaned data shape: {data.shape}\")\n",
" return data\n",
"\n",
"\n",
"# Use the functions\n",
"data = load_data(DATASET_DIR)\n",
"data = clean_data(data)\n",
"\n",
"# print(data.head())\n",
"\n",
"# Print Headers\n",
"print(data.columns)\n",
"\n",
"# Check that there are no missing values\n",
"assert data.isnull().sum().sum() == 0\n",
"print(\"No missing values\")\n"
"# print(data.columns)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-02-25T02:29:51.084821398Z",
"start_time": "2024-02-25T02:29:46.405675293Z"
"end_time": "2024-02-25T03:27:06.334698247Z",
"start_time": "2024-02-25T03:26:58.458307532Z"
}
},
"id": "dd9657f5ec6d7754",
"execution_count": 40
"execution_count": 74
},
{
"cell_type": "markdown",
@@ -164,16 +157,19 @@
],
"source": [
"from sklearn.decomposition import PCA\n",
"\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"# Standardize the data\n",
"data_std = StandardScaler().fit_transform(data)\n",
"numerical_cols = data.select_dtypes(include=[np.number]).columns\n",
"scaler = StandardScaler()\n",
"data[numerical_cols] = scaler.fit_transform(data[numerical_cols])\n",
"\n",
"# Initialize PCA with the desired explained variance\n",
"pca = PCA(0.95)\n",
"\n",
"# Fit PCA to your data\n",
"pca.fit(data_std)\n",
"pca.fit(data)\n",
"\n",
"# Get the number of components\n",
"num_components = pca.n_components_\n",
@@ -183,12 +179,12 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-02-25T02:29:58.267018142Z",
"start_time": "2024-02-25T02:29:51.084440279Z"
"end_time": "2024-02-25T03:27:13.639843012Z",
"start_time": "2024-02-25T03:27:06.336830842Z"
}
},
"id": "7f9bec73a42f7bca",
"execution_count": 41
"execution_count": 75
},
{
"cell_type": "markdown",
@@ -217,7 +213,7 @@
],
"source": [
"# Project original data to PC with the highest eigenvalue\n",
"data_pca = pca.transform(data_std)\n",
"data_pca = pca.transform(data)\n",
"\n",
"# Create a dataframe with the principal components\n",
"data_pca_df = pd.DataFrame(data_pca, columns=[f\"PC{i}\" for i in range(1, num_components + 1)])\n",
@ -237,12 +233,12 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-02-25T02:29:59.029369440Z",
"start_time": "2024-02-25T02:29:58.266576678Z"
"end_time": "2024-02-25T03:27:14.422886263Z",
"start_time": "2024-02-25T03:27:13.660170622Z"
}
},
"id": "96c62c50f8734a01",
"execution_count": 42
"execution_count": 76
}
],
"metadata": {