# CSC 3105 Project

In [39]:
import os

# Importing the libraries
import pandas as pd

DATASET_DIR = './UWB-LOS-NLOS-Data-Set/dataset'

# Load the data into a pandas dataframe

In [40]:
def load_data(dataset_dir):
    # Get all file paths in the directory
    file_paths = [os.path.join(dirpath, file) for dirpath, _, filenames in os.walk(dataset_dir) for file in filenames]

    # Load and concatenate all dataframes
    data = pd.concat([pd.read_csv(file_path) for file_path in file_paths])

    return data


data = load_data(DATASET_DIR)

print(data.head())

# Print Headers
print(data.columns)

# Check that there are no missing values
assert data.isnull().sum().sum() == 0
print("No missing values")


   NLOS  RANGE  FP_IDX  FP_AMP1  FP_AMP2  FP_AMP3  STDEV_NOISE  CIR_PWR  \
0   1.0   6.18   749.0   4889.0  13876.0  10464.0        240.0   9048.0   
1   1.0   4.54   741.0   2474.0   2002.0   1593.0         68.0   6514.0   
2   1.0   4.39   744.0   1934.0   2615.0   4114.0         52.0   2880.0   
3   1.0   1.27   748.0  16031.0  17712.0  10420.0         64.0  12855.0   
4   0.0   1.16   743.0  20070.0  19886.0  15727.0         76.0  11607.0   

   MAX_NOISE  RXPACC  ...  CIR1006  CIR1007  CIR1008  CIR1009  CIR1010  \
0     3668.0  1024.0  ...    818.0    938.0    588.0    277.0    727.0   
1     1031.0  1024.0  ...    289.0    228.0    107.0    487.0    491.0   
2      796.0  1024.0  ...    123.0    281.0    483.0     97.0    272.0   
3     1529.0   323.0  ...    169.0    138.0    219.0     94.0    225.0   
4     2022.0   296.0  ...     87.0     43.0    358.0    308.0    132.0   

   CIR1011  CIR1012  CIR1013  CIR1014  CIR1015  
0    367.0    803.0    819.0    467.0    768.0  
1    4

The selected code is performing data standardization, which is a common preprocessing step in many machine learning workflows. 

The purpose of standardization is to transform the data such that it has a mean of 0 and a standard deviation of 1. This is done to ensure that all features have the same scale, which is a requirement for many machine learning algorithms.

The mathematical formulas used in this process are as follows:

1. Calculate the mean (Î¼) of the data:

$$
\mu = \frac{1}{n} \sum_{i=1}^{n} x_i
$$
Where:
- $n$ is the number of observations in the data
- $x_i$ is the value of the $i$-th observation
- $\sum$ denotes the summation over all observations

2. Standardize the data by subtracting the mean from each observation and dividing by the standard deviation:

$$
\text{Data}_i = \frac{x_i - \mu}{\sigma}
$$
Where:
- $\text{Data}_i$ is the standardized value of the $i$-th observation
- $\sigma$ is the standard deviation of the data
- $x_i$ is the value of the $i$-th observation
- $\mu$ is the mean of the data

The `StandardScaler` class from the `sklearn.preprocessing` module is used to perform this standardization. The `fit_transform` method is used to calculate the mean and standard deviation of the data and then perform the standardization.

**Note:** By setting the explained variance to 0.95, we are saying that we want to choose the smallest number of principal components such that 95% of the variance in the original data is retained. This means that the transformed data will retain 95% of the information of the original data, while potentially having fewer dimensions.


In [41]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the data
data_std = StandardScaler().fit_transform(data)

# Initialize PCA with the desired explained variance
pca = PCA(0.95)

# Fit PCA to your data
pca.fit(data_std)

# Get the number of components
num_components = pca.n_components_

print(f"The number of principle components after PCA is {num_components}")

The number of principle components after PCA is 868


# Perform Dimensionality Reduction with PCA

We can use the `transform` method of the `PCA` object to project the original data onto the principal components. This will give us the transformed data with the desired number of components.

In [42]:
# Project original data to PC with the highest eigenvalue
data_pca = pca.transform(data_std)

# Create a dataframe with the principal components
data_pca_df = pd.DataFrame(data_pca, columns=[f"PC{i}" for i in range(1, num_components + 1)])

# Print the number of components in the original and PCA transformed data
print(f"Original number of components: {data.shape[1]}")
print(f"Number of components after PCA: {num_components}")

# Compare the number of components in the original and PCA transformed data
if data.shape[1] > num_components:
    print("PCA has successfully reduced the number of components.")
elif data.shape[1] < num_components:
    print("Unexpectedly, PCA has increased the number of components.")
else:
    print("The number of components remains unchanged after PCA.")

Original number of components: 1031
Number of components after PCA: 868
PCA has successfully reduced the number of components.
