From 905f3bc7b70c094265a850b49ef8dc0d9033b2fc Mon Sep 17 00:00:00 2001 From: devoalda Date: Sun, 25 Feb 2024 10:37:14 +0800 Subject: [PATCH] Initial commit --- .gitignore | 477 ++++++++++++++++++++++++++++++++++++++++++++++++++ Project.ipynb | 269 ++++++++++++++++++++++++++++ README.md | 18 ++ 3 files changed, 764 insertions(+) create mode 100644 .gitignore create mode 100644 Project.ipynb create mode 100644 README.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2787f12 --- /dev/null +++ b/.gitignore @@ -0,0 +1,477 @@ +### venv template +# Virtualenv +# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ +.Python +[Bb]in +[Ii]nclude +[Ll]ib +[Ll]ib64 +[Ll]ocal +[Ss]cripts +pyvenv.cfg +.venv +pip-selfcheck.json + +### TeX template +## Core latex/pdflatex auxiliary files: +*.aux +*.lof +*.log +*.lot +*.fls +*.out +*.toc +*.fmt +*.fot +*.cb +*.cb2 +.*.lb + +## Intermediate documents: +*.dvi +*.xdv +*-converted-to.* +# these rules might exclude image files for figures etc. +# *.ps +# *.eps +# *.pdf + +## Generated if empty string is given at "Please type another file name for output:" +.pdf + +## Bibliography auxiliary files (bibtex/biblatex/biber): +*.bbl +*.bcf +*.blg +*-blx.aux +*-blx.bib +*.run.xml + +## Build tool auxiliary files: +*.fdb_latexmk +*.synctex +*.synctex(busy) +*.synctex.gz +*.synctex.gz(busy) +*.pdfsync + +## Build tool directories for auxiliary files +# latexrun +latex.out/ + +## Auxiliary and intermediate files from other packages: +# algorithms +*.alg +*.loa + +# achemso +acs-*.bib + +# amsthm +*.thm + +# beamer +*.nav +*.pre +*.snm +*.vrb + +# changes +*.soc + +# comment +*.cut + +# cprotect +*.cpt + +# elsarticle (documentclass of Elsevier journals) +*.spl + +# endnotes +*.ent + +# fixme +*.lox + +# feynmf/feynmp +*.mf +*.mp +*.t[1-9] +*.t[1-9][0-9] +*.tfm + +#(r)(e)ledmac/(r)(e)ledpar +*.end +*.?end +*.[1-9] +*.[1-9][0-9] +*.[1-9][0-9][0-9] +*.[1-9]R +*.[1-9][0-9]R +*.[1-9][0-9][0-9]R +*.eledsec[1-9] +*.eledsec[1-9]R +*.eledsec[1-9][0-9] +*.eledsec[1-9][0-9]R +*.eledsec[1-9][0-9][0-9] +*.eledsec[1-9][0-9][0-9]R + +# glossaries +*.acn +*.acr +*.glg +*.glo +*.gls +*.glsdefs +*.lzo +*.lzs +*.slg +*.slo +*.sls + +# uncomment this for glossaries-extra (will ignore makeindex's style files!) 
+# *.ist + +# gnuplot +*.gnuplot +*.table + +# gnuplottex +*-gnuplottex-* + +# gregoriotex +*.gaux +*.glog +*.gtex + +# htlatex +*.4ct +*.4tc +*.idv +*.lg +*.trc +*.xref + +# hyperref +*.brf + +# knitr +*-concordance.tex +# TODO Uncomment the next line if you use knitr and want to ignore its generated tikz files +# *.tikz +*-tikzDictionary + +# listings +*.lol + +# luatexja-ruby +*.ltjruby + +# makeidx +*.idx +*.ilg +*.ind + +# minitoc +*.maf +*.mlf +*.mlt +*.mtc[0-9]* +*.slf[0-9]* +*.slt[0-9]* +*.stc[0-9]* + +# minted +_minted* +*.pyg + +# morewrites +*.mw + +# newpax +*.newpax + +# nomencl +*.nlg +*.nlo +*.nls + +# pax +*.pax + +# pdfpcnotes +*.pdfpc + +# sagetex +*.sagetex.sage +*.sagetex.py +*.sagetex.scmd + +# scrwfile +*.wrt + +# svg +svg-inkscape/ + +# sympy +*.sout +*.sympy +sympy-plots-for-*.tex/ + +# pdfcomment +*.upa +*.upb + +# pythontex +*.pytxcode +pythontex-files-*/ + +# tcolorbox +*.listing + +# thmtools +*.loe + +# TikZ & PGF +*.dpth +*.md5 +*.auxlock + +# titletoc +*.ptc + +# todonotes +*.tdo + +# vhistory +*.hst +*.ver + +# easy-todo +*.lod + +# xcolor +*.xcp + +# xmpincl +*.xmpi + +# xindy +*.xdy + +# xypic precompiled matrices and outlines +*.xyc +*.xyd + +# endfloat +*.ttt +*.fff + +# Latexian +TSWLatexianTemp* + +## Editors: +# WinEdt +*.bak +*.sav + +# Texpad +.texpadtmp + +# LyX +*.lyx~ + +# Kile +*.backup + +# gummi +.*.swp + +# KBibTeX +*~[0-9]* + +# TeXnicCenter +*.tps + +# auto folder when using emacs and auctex +./auto/* +*.el + +# expex forward references with \gathertags +*-tags.tex + +# standalone packages +*.sta + +# Makeindex log files +*.lpz + +# xwatermark package +*.xwm + +# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib +# option is specified. Footnotes are the stored in a file with suffix Notes.bib. +# Uncomment the next line to have this generated file ignored. +#*Notes.bib + +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. 
+#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ + +/UWB-LOS-NLOS-Data-Set/ diff --git a/Project.ipynb b/Project.ipynb new file mode 100644 index 0000000..68be8cd --- /dev/null +++ b/Project.ipynb @@ -0,0 +1,269 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# CSC 3105 Project" + ], + "metadata": { + "collapsed": false + }, + "id": "cda961ffb493d00c" + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Importing the libraries\n", + "import pandas as pd\n", + "\n", + "DATASET_DIR = './UWB-LOS-NLOS-Data-Set/dataset'" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-02-25T02:29:46.399745088Z", + "start_time": "2024-02-25T02:29:46.386566147Z" + } + }, + "id": "bcd6cbaa5df10ce8", + "execution_count": 39 + }, + { + "cell_type": "markdown", + "source": [ + "# Load the data into a pandas dataframe" + ], + "metadata": { + "collapsed": false + }, + "id": "bab890d7b05e347e" + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " NLOS RANGE FP_IDX FP_AMP1 FP_AMP2 FP_AMP3 STDEV_NOISE CIR_PWR \\\n", + "0 1.0 6.18 749.0 4889.0 13876.0 10464.0 240.0 9048.0 \n", + "1 1.0 4.54 741.0 2474.0 2002.0 1593.0 68.0 6514.0 \n", + "2 1.0 4.39 744.0 1934.0 2615.0 4114.0 52.0 2880.0 \n", + "3 1.0 1.27 748.0 16031.0 17712.0 10420.0 64.0 12855.0 \n", + "4 0.0 1.16 743.0 20070.0 19886.0 15727.0 76.0 11607.0 \n", + "\n", + " MAX_NOISE RXPACC ... CIR1006 CIR1007 CIR1008 CIR1009 CIR1010 \\\n", + "0 3668.0 1024.0 ... 818.0 938.0 588.0 277.0 727.0 \n", + "1 1031.0 1024.0 ... 289.0 228.0 107.0 487.0 491.0 \n", + "2 796.0 1024.0 ... 123.0 281.0 483.0 97.0 272.0 \n", + "3 1529.0 323.0 ... 169.0 138.0 219.0 94.0 225.0 \n", + "4 2022.0 296.0 ... 
87.0     43.0    358.0    308.0    132.0  \n",
+      "\n",
+      "   CIR1011  CIR1012  CIR1013  CIR1014  CIR1015  \n",
+      "0    367.0    803.0    819.0    467.0    768.0  \n",
+      "1    404.0    334.0    210.0    102.0      0.0  \n",
+      "2     73.0    125.0    169.0    182.0      0.0  \n",
+      "3    155.0    172.0    278.0    318.0      0.0  \n",
+      "4    131.0    102.0    126.0    163.0      0.0  \n",
+      "\n",
+      "[5 rows x 1031 columns]\n",
+      "Index(['NLOS', 'RANGE', 'FP_IDX', 'FP_AMP1', 'FP_AMP2', 'FP_AMP3',\n",
+      "       'STDEV_NOISE', 'CIR_PWR', 'MAX_NOISE', 'RXPACC',\n",
+      "       ...\n",
+      "       'CIR1006', 'CIR1007', 'CIR1008', 'CIR1009', 'CIR1010', 'CIR1011',\n",
+      "       'CIR1012', 'CIR1013', 'CIR1014', 'CIR1015'],\n",
+      "      dtype='object', length=1031)\n",
+      "No missing values\n"
+     ]
+    }
+   ],
+   "source": [
+    "def load_data(dataset_dir):\n",
+    "    # Get all file paths in the dataset directory tree\n",
+    "    file_paths = [os.path.join(dirpath, file) for dirpath, _, filenames in os.walk(dataset_dir) for file in filenames]\n",
+    "\n",
+    "    # Load and concatenate all dataframes; ignore_index keeps row labels unique across files\n",
+    "    data = pd.concat([pd.read_csv(file_path) for file_path in file_paths], ignore_index=True)\n",
+    "\n",
+    "    return data\n",
+    "\n",
+    "\n",
+    "data = load_data(DATASET_DIR)\n",
+    "\n",
+    "print(data.head())\n",
+    "\n",
+    "# Print headers\n",
+    "print(data.columns)\n",
+    "\n",
+    "# Check that there are no missing values\n",
+    "assert data.isnull().sum().sum() == 0\n",
+    "print(\"No missing values\")\n"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-02-25T02:29:51.084821398Z",
+     "start_time": "2024-02-25T02:29:46.405675293Z"
+    }
+   },
+   "id": "dd9657f5ec6d7754",
+   "execution_count": 40
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "# Standardize the Data\n",
+    "\n",
+    "The code below standardizes the data, a common preprocessing step in machine learning workflows.\n",
+    "\n",
+    "Standardization transforms each feature so that it has a mean of 0 and a standard deviation of 1, putting all features on the same scale. This matters for PCA in particular: PCA picks out directions of maximum variance, so without scaling, features with large numeric ranges would dominate the principal components.\n",
+    "\n",
+    "The mathematical formulas used in this process are as follows:\n",
+    "\n",
+    "1. Calculate the mean ($\\mu$) of the data:\n",
+    "\n",
+    "$$\n",
+    "\\mu = \\frac{1}{n} \\sum_{i=1}^{n} x_i\n",
+    "$$\n",
+    "Where:\n",
+    "- $n$ is the number of observations\n",
+    "- $x_i$ is the value of the $i$-th observation\n",
+    "- $\\sum$ denotes summation over all observations\n",
+    "\n",
+    "2. Calculate the standard deviation ($\\sigma$) of the data:\n",
+    "\n",
+    "$$\n",
+    "\\sigma = \\sqrt{\\frac{1}{n} \\sum_{i=1}^{n} (x_i - \\mu)^2}\n",
+    "$$\n",
+    "\n",
+    "3. Standardize each observation by subtracting the mean and dividing by the standard deviation:\n",
+    "\n",
+    "$$\n",
+    "z_i = \\frac{x_i - \\mu}{\\sigma}\n",
+    "$$\n",
+    "Where $z_i$ is the standardized value of the $i$-th observation.\n",
+    "\n",
+    "The `StandardScaler` class from `sklearn.preprocessing` performs this standardization: its `fit_transform` method computes the mean and standard deviation of each feature and then applies the transformation.\n",
+    "\n",
+    "**Note:** By setting the explained variance to 0.95, we choose the smallest number of principal components such that 95% of the variance in the original data is retained. This means the transformed data keeps 95% of the information in the original data while potentially having fewer dimensions.\n"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "2c13064e20601717"
+  },
+  {
+   "cell_type": "code",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The number of principal components after PCA is 868\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.decomposition import PCA\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "\n",
+    "# Standardize the data\n",
+    "data_std = StandardScaler().fit_transform(data)\n",
+    "\n",
+    "# Initialize PCA with the desired explained variance\n",
+    "pca = PCA(0.95)\n",
+    "\n",
+    "# Fit PCA to the standardized data\n",
+    "pca.fit(data_std)\n",
+    "\n",
+    "# Get the number of components\n",
+    "num_components = pca.n_components_\n",
+    "\n",
+    "print(f\"The number of principal components after PCA is {num_components}\")"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-02-25T02:29:58.267018142Z",
+     "start_time": "2024-02-25T02:29:51.084440279Z"
+    }
+   },
+   "id": "7f9bec73a42f7bca",
+   "execution_count": 41
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "# Perform Dimensionality Reduction with PCA\n",
+    "\n",
+    "We can use the `transform` method of the fitted `PCA` object to project the standardized data onto the principal components. This gives us the transformed data with the desired number of components."
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "dc9f8c0e194dd07d"
+  },
+  {
+   "cell_type": "code",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Original number of components: 1031\n",
+      "Number of components after PCA: 868\n",
+      "PCA has successfully reduced the number of components.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Project the standardized data onto the retained principal components\n",
+    "data_pca = pca.transform(data_std)\n",
+    "\n",
+    "# Create a dataframe with the principal components\n",
+    "data_pca_df = pd.DataFrame(data_pca, columns=[f\"PC{i}\" for i in range(1, num_components + 1)])\n",
+    "\n",
+    "# Print the number of components in the original and PCA-transformed data\n",
+    "print(f\"Original number of components: {data.shape[1]}\")\n",
+    "print(f\"Number of components after PCA: {num_components}\")\n",
+    "\n",
+    "# Compare the number of components in the original and PCA-transformed data\n",
+    "if data.shape[1] > num_components:\n",
+    "    print(\"PCA has successfully reduced the number of components.\")\n",
+    "elif data.shape[1] < num_components:\n",
+    "    print(\"Unexpectedly, PCA has increased the number of components.\")\n",
+    "else:\n",
+    "    print(\"The number of components remains unchanged after PCA.\")"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-02-25T02:29:59.029369440Z",
+     "start_time": "2024-02-25T02:29:58.266576678Z"
+    }
+   },
+   "id": "96c62c50f8734a01",
+   "execution_count": 42
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e4fd770
--- /dev/null
+++ b/README.md
@@ -0,0 +1,18 @@
+# CSC 3105 Project
+
+This project involves data preprocessing and dimensionality reduction using Principal Component Analysis (PCA). The data is first standardized to have a mean of 0 and a standard deviation of 1. PCA is then applied to reduce the dimensionality of the data while retaining 95% of the original variance.
+
+## File Structure
+
+The project consists of the following files and directories:
+
+- `Project.ipynb`: The main Jupyter notebook, where all the data loading, preprocessing, and PCA steps are performed.
+- `./UWB-LOS-NLOS-Data-Set/dataset`: This directory contains the dataset used in the project.
+
+## Installation and Setup
+
+```bash
+git clone <repository-url>
+cd <project-directory>
+pip install -r requirements.txt
+```
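
The README's setup step references a `requirements.txt`, but the commit does not add one (the diffstat lists only `.gitignore`, `Project.ipynb`, and `README.md`). A minimal file covering the imports actually used in `Project.ipynb` (pandas, scikit-learn, and Jupyter to run the notebook) might look like the sketch below; the package list is inferred from the notebook's imports, not taken from the repository:

```
pandas
scikit-learn
notebook
```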
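
For reference, the notebook's core pipeline condenses into a short standalone script. This is a minimal sketch mirroring the cells in `Project.ipynb` (load the CSVs, standardize, fit `PCA(0.95)`); the `.csv` filename filter is an added assumption about the dataset directory's contents:

```python
import os

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

DATASET_DIR = "./UWB-LOS-NLOS-Data-Set/dataset"

# Gather every CSV under the dataset directory and stack them
# into a single dataframe with a unique row index.
frames = [
    pd.read_csv(os.path.join(dirpath, name))
    for dirpath, _, filenames in os.walk(DATASET_DIR)
    for name in filenames
    if name.endswith(".csv")
]
data = pd.concat(frames, ignore_index=True)

# Standardize each feature to mean 0 and standard deviation 1;
# PCA is variance-based, so unscaled features would dominate.
data_std = StandardScaler().fit_transform(data)

# Keep the smallest number of components that explains 95% of the variance.
pca = PCA(n_components=0.95)
data_pca = pca.fit_transform(data_std)

print(f"{data.shape[1]} original columns -> {pca.n_components_} principal components")
```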