Updated with Loading and Cleaning dataset
parent cd1abfbd9f
commit 3ca3e6b16f
Project.ipynb (116 lines changed)
@@ -16,25 +16,26 @@
   "source": [
    "import os\n",
    "\n",
-   "# Importing the libraries\n",
-   "import pandas as pd\n",
-   "\n",
    "DATASET_DIR = './UWB-LOS-NLOS-Data-Set/dataset'"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
-    "end_time": "2024-02-25T02:29:46.399745088Z",
-    "start_time": "2024-02-25T02:29:46.386566147Z"
+    "end_time": "2024-02-25T03:26:58.464846949Z",
+    "start_time": "2024-02-25T03:26:58.415028614Z"
    }
   },
   "id": "bcd6cbaa5df10ce8",
-  "execution_count": 39
+  "execution_count": 73
  },
  {
   "cell_type": "markdown",
   "source": [
-   "# Load the data into a pandas dataframe"
+   "# Load the data into a pandas dataframe\n",
+   "\n",
+   "The first step in any data analysis project is to load the data into a suitable data structure. In this case, we will use the `pandas` library to load the data into a dataframe.\n",
+   "\n",
+   "We then clean the data by handling missing values, removing duplicates, converting data types, and performing outlier detection and removal. "
   ],
   "metadata": {
    "collapsed": false
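The markdown cell added above outlines the load-and-clean pipeline that the next hunk implements. A minimal self-contained sketch of the loading step it describes, assuming every file under DATASET_DIR is a CSV with identical columns; `ignore_index=True` is an addition for illustration, not part of the commit:

import os
import pandas as pd

DATASET_DIR = './UWB-LOS-NLOS-Data-Set/dataset'

# Walk the dataset directory and collect every file path.
file_paths = [os.path.join(dirpath, name)
              for dirpath, _, filenames in os.walk(DATASET_DIR)
              for name in filenames]

# Read each CSV and stack the frames; ignore_index=True renumbers rows
# 0..n-1 instead of repeating each file's own 0..k index (the committed
# load_data omits it, so row labels repeat across files).
data = pd.concat((pd.read_csv(p) for p in file_paths), ignore_index=True)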
@@ -48,69 +49,61 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     " NLOS RANGE FP_IDX FP_AMP1 FP_AMP2 FP_AMP3 STDEV_NOISE CIR_PWR \\\n",
-     "0 1.0 6.18 749.0 4889.0 13876.0 10464.0 240.0 9048.0 \n",
-     "1 1.0 4.54 741.0 2474.0 2002.0 1593.0 68.0 6514.0 \n",
-     "2 1.0 4.39 744.0 1934.0 2615.0 4114.0 52.0 2880.0 \n",
-     "3 1.0 1.27 748.0 16031.0 17712.0 10420.0 64.0 12855.0 \n",
-     "4 0.0 1.16 743.0 20070.0 19886.0 15727.0 76.0 11607.0 \n",
-     "\n",
-     " MAX_NOISE RXPACC ... CIR1006 CIR1007 CIR1008 CIR1009 CIR1010 \\\n",
-     "0 3668.0 1024.0 ... 818.0 938.0 588.0 277.0 727.0 \n",
-     "1 1031.0 1024.0 ... 289.0 228.0 107.0 487.0 491.0 \n",
-     "2 796.0 1024.0 ... 123.0 281.0 483.0 97.0 272.0 \n",
-     "3 1529.0 323.0 ... 169.0 138.0 219.0 94.0 225.0 \n",
-     "4 2022.0 296.0 ... 87.0 43.0 358.0 308.0 132.0 \n",
-     "\n",
-     " CIR1011 CIR1012 CIR1013 CIR1014 CIR1015 \n",
-     "0 367.0 803.0 819.0 467.0 768.0 \n",
-     "1 404.0 334.0 210.0 102.0 0.0 \n",
-     "2 73.0 125.0 169.0 182.0 0.0 \n",
-     "3 155.0 172.0 278.0 318.0 0.0 \n",
-     "4 131.0 102.0 126.0 163.0 0.0 \n",
-     "\n",
-     "[5 rows x 1031 columns]\n",
-     "Index(['NLOS', 'RANGE', 'FP_IDX', 'FP_AMP1', 'FP_AMP2', 'FP_AMP3',\n",
-     "       'STDEV_NOISE', 'CIR_PWR', 'MAX_NOISE', 'RXPACC',\n",
-     "       ...\n",
-     "       'CIR1006', 'CIR1007', 'CIR1008', 'CIR1009', 'CIR1010', 'CIR1011',\n",
-     "       'CIR1012', 'CIR1013', 'CIR1014', 'CIR1015'],\n",
-     "      dtype='object', length=1031)\n",
-     "No missing values\n"
+     "Original data shape: (42000, 1031)\n",
+     "Cleaned data shape: (42000, 1031)\n"
    ]
   }
  ],
  "source": [
+  "import pandas as pd\n",
+  "import numpy as np\n",
+  "from scipy import stats\n",
+  "\n",
+  "\n",
   "def load_data(dataset_dir):\n",
-  "    # Get all file paths in the directory\n",
+  "    # Load the data\n",
   "    file_paths = [os.path.join(dirpath, file) for dirpath, _, filenames in os.walk(dataset_dir) for file in filenames]\n",
-  "\n",
-  "    # Load and concatenate all dataframes\n",
-  "    data = pd.concat([pd.read_csv(file_path) for file_path in file_paths])\n",
-  "\n",
+  "    data = pd.concat((pd.read_csv(file_path) for file_path in file_paths))\n",
+  "    print(f\"Original data shape: {data.shape}\")\n",
   "    return data\n",
   "\n",
   "\n",
-  "data = load_data(DATASET_DIR)\n",
+  "def clean_data(data):\n",
+  "    # Handle missing values\n",
+  "    data = data.dropna()\n",
   "\n",
-  "print(data.head())\n",
+  "    # Remove duplicates\n",
+  "    data = data.drop_duplicates()\n",
+  "\n",
+  "    # Convert data types\n",
+  "    data['NLOS'] = data['NLOS'].astype(int)\n",
+  "\n",
+  "    # Outlier detection and removal\n",
+  "    z_scores = np.abs(stats.zscore(data.select_dtypes(include=[np.number])))\n",
+  "    data = data[(z_scores < 3).any(axis=1)]\n",
+  "\n",
+  "    print(f\"Cleaned data shape: {data.shape}\")\n",
+  "    return data\n",
+  "\n",
+  "\n",
+  "# Use the functions\n",
+  "data = load_data(DATASET_DIR)\n",
+  "data = clean_data(data)\n",
+  "\n",
+  "# print(data.head())\n",
   "\n",
   "# Print Headers\n",
-  "print(data.columns)\n",
-  "\n",
-  "# Check that there are no missing values\n",
-  "assert data.isnull().sum().sum() == 0\n",
-  "print(\"No missing values\")\n"
+  "# print(data.columns)"
  ],
  "metadata": {
   "collapsed": false,
   "ExecuteTime": {
-   "end_time": "2024-02-25T02:29:51.084821398Z",
-   "start_time": "2024-02-25T02:29:46.405675293Z"
+   "end_time": "2024-02-25T03:27:06.334698247Z",
+   "start_time": "2024-02-25T03:26:58.458307532Z"
   }
  },
  "id": "dd9657f5ec6d7754",
- "execution_count": 40
+ "execution_count": 74
 },
 {
  "cell_type": "markdown",
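One caveat worth flagging in clean_data: with roughly a thousand numeric columns, `(z_scores < 3).any(axis=1)` keeps every row that is within three standard deviations on at least one feature, so it removes almost nothing, which matches the unchanged (42000, 1031) shape in the cell output. The conventional z-score filter keeps only rows where every feature is within the threshold. A sketch of that variant under the same imports as the commit; the helper name remove_outliers is illustrative, and this is an alternative, not the committed behaviour:

import numpy as np
from scipy import stats

def remove_outliers(data, threshold=3.0):
    # Per-column z-scores over the numeric columns only.
    numeric = data.select_dtypes(include=[np.number])
    z_scores = np.abs(stats.zscore(numeric))
    # Keep a row only if ALL of its numeric features are within the
    # threshold; .any(axis=1) would keep a row if even one feature is.
    # Note: a constant column yields NaN z-scores and would drop every
    # row here, so constant columns should be excluded first.
    return data[(z_scores < threshold).all(axis=1)]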
@@ -164,16 +157,19 @@
  ],
  "source": [
   "from sklearn.decomposition import PCA\n",
+  "\n",
   "from sklearn.preprocessing import StandardScaler\n",
   "\n",
   "# Standardize the data\n",
-  "data_std = StandardScaler().fit_transform(data)\n",
+  "numerical_cols = data.select_dtypes(include=[np.number]).columns\n",
+  "scaler = StandardScaler()\n",
+  "data[numerical_cols] = scaler.fit_transform(data[numerical_cols])\n",
   "\n",
   "# Initialize PCA with the desired explained variance\n",
   "pca = PCA(0.95)\n",
   "\n",
   "# Fit PCA to your data\n",
-  "pca.fit(data_std)\n",
+  "pca.fit(data)\n",
   "\n",
   "# Get the number of components\n",
   "num_components = pca.n_components_\n",
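For context on `pca = PCA(0.95)`: when scikit-learn's PCA receives a float between 0 and 1 as n_components, it keeps the smallest number of components whose cumulative explained variance reaches that fraction. A self-contained illustration on synthetic data; the array names and shapes here are made up for the example:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
# Correlated synthetic features, so PCA has redundancy to compress.
X = rng.normal(size=(500, 20)) @ rng.normal(size=(20, 20))

X_std = StandardScaler().fit_transform(X)
pca = PCA(0.95)   # fraction of variance to retain, not a component count
pca.fit(X_std)

print(pca.n_components_)                    # chosen automatically
print(pca.explained_variance_ratio_.sum())  # cumulative ratio, >= 0.95

One property of the change itself: after the in-place standardization, `pca.fit(data)` runs on every column of the cleaned frame, including the NLOS label that clean_data cast to int.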
@@ -183,12 +179,12 @@
  "metadata": {
   "collapsed": false,
   "ExecuteTime": {
-   "end_time": "2024-02-25T02:29:58.267018142Z",
-   "start_time": "2024-02-25T02:29:51.084440279Z"
+   "end_time": "2024-02-25T03:27:13.639843012Z",
+   "start_time": "2024-02-25T03:27:06.336830842Z"
   }
  },
  "id": "7f9bec73a42f7bca",
- "execution_count": 41
+ "execution_count": 75
 },
 {
  "cell_type": "markdown",
@@ -217,7 +213,7 @@
  ],
  "source": [
   "# Project original data to PC with the highest eigenvalue\n",
-  "data_pca = pca.transform(data_std)\n",
+  "data_pca = pca.transform(data)\n",
   "\n",
   "# Create a dataframe with the principal components\n",
   "data_pca_df = pd.DataFrame(data_pca, columns=[f\"PC{i}\" for i in range(1, num_components + 1)])\n",
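The projection cell wraps the transformed data in a dataframe with one column per retained component. Continuing the synthetic example from the PCA note above; the names are illustrative:

import pandas as pd

# Project the standardized data onto the retained components.
X_pca = pca.transform(X_std)

# One column per component, PC1..PCn, matching pca.n_components_.
pc_cols = [f"PC{i}" for i in range(1, pca.n_components_ + 1)]
X_pca_df = pd.DataFrame(X_pca, columns=pc_cols)
print(X_pca_df.shape)  # (500, pca.n_components_)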
@@ -237,12 +233,12 @@
  "metadata": {
   "collapsed": false,
   "ExecuteTime": {
-   "end_time": "2024-02-25T02:29:59.029369440Z",
-   "start_time": "2024-02-25T02:29:58.266576678Z"
+   "end_time": "2024-02-25T03:27:14.422886263Z",
+   "start_time": "2024-02-25T03:27:13.660170622Z"
   }
  },
  "id": "96c62c50f8734a01",
- "execution_count": 42
+ "execution_count": 76
 }
],
"metadata": {