Initial commit

devoalda 2024-02-25 10:37:14 +08:00
commit 905f3bc7b7
3 changed files with 764 additions and 0 deletions

477
.gitignore vendored Normal file

@@ -0,0 +1,477 @@
### venv template
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
.Python
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
.venv
pip-selfcheck.json
### TeX template
## Core latex/pdflatex auxiliary files:
*.aux
*.lof
*.log
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2
.*.lb
## Intermediate documents:
*.dvi
*.xdv
*-converted-to.*
# these rules might exclude image files for figures etc.
# *.ps
# *.eps
# *.pdf
## Generated if empty string is given at "Please type another file name for output:"
.pdf
## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
*.bcf
*.blg
*-blx.aux
*-blx.bib
*.run.xml
## Build tool auxiliary files:
*.fdb_latexmk
*.synctex
*.synctex(busy)
*.synctex.gz
*.synctex.gz(busy)
*.pdfsync
## Build tool directories for auxiliary files
# latexrun
latex.out/
## Auxiliary and intermediate files from other packages:
# algorithms
*.alg
*.loa
# achemso
acs-*.bib
# amsthm
*.thm
# beamer
*.nav
*.pre
*.snm
*.vrb
# changes
*.soc
# comment
*.cut
# cprotect
*.cpt
# elsarticle (documentclass of Elsevier journals)
*.spl
# endnotes
*.ent
# fixme
*.lox
# feynmf/feynmp
*.mf
*.mp
*.t[1-9]
*.t[1-9][0-9]
*.tfm
#(r)(e)ledmac/(r)(e)ledpar
*.end
*.?end
*.[1-9]
*.[1-9][0-9]
*.[1-9][0-9][0-9]
*.[1-9]R
*.[1-9][0-9]R
*.[1-9][0-9][0-9]R
*.eledsec[1-9]
*.eledsec[1-9]R
*.eledsec[1-9][0-9]
*.eledsec[1-9][0-9]R
*.eledsec[1-9][0-9][0-9]
*.eledsec[1-9][0-9][0-9]R
# glossaries
*.acn
*.acr
*.glg
*.glo
*.gls
*.glsdefs
*.lzo
*.lzs
*.slg
*.slo
*.sls
# uncomment this for glossaries-extra (will ignore makeindex's style files!)
# *.ist
# gnuplot
*.gnuplot
*.table
# gnuplottex
*-gnuplottex-*
# gregoriotex
*.gaux
*.glog
*.gtex
# htlatex
*.4ct
*.4tc
*.idv
*.lg
*.trc
*.xref
# hyperref
*.brf
# knitr
*-concordance.tex
# TODO Uncomment the next line if you use knitr and want to ignore its generated tikz files
# *.tikz
*-tikzDictionary
# listings
*.lol
# luatexja-ruby
*.ltjruby
# makeidx
*.idx
*.ilg
*.ind
# minitoc
*.maf
*.mlf
*.mlt
*.mtc[0-9]*
*.slf[0-9]*
*.slt[0-9]*
*.stc[0-9]*
# minted
_minted*
*.pyg
# morewrites
*.mw
# newpax
*.newpax
# nomencl
*.nlg
*.nlo
*.nls
# pax
*.pax
# pdfpcnotes
*.pdfpc
# sagetex
*.sagetex.sage
*.sagetex.py
*.sagetex.scmd
# scrwfile
*.wrt
# svg
svg-inkscape/
# sympy
*.sout
*.sympy
sympy-plots-for-*.tex/
# pdfcomment
*.upa
*.upb
# pythontex
*.pytxcode
pythontex-files-*/
# tcolorbox
*.listing
# thmtools
*.loe
# TikZ & PGF
*.dpth
*.md5
*.auxlock
# titletoc
*.ptc
# todonotes
*.tdo
# vhistory
*.hst
*.ver
# easy-todo
*.lod
# xcolor
*.xcp
# xmpincl
*.xmpi
# xindy
*.xdy
# xypic precompiled matrices and outlines
*.xyc
*.xyd
# endfloat
*.ttt
*.fff
# Latexian
TSWLatexianTemp*
## Editors:
# WinEdt
*.bak
*.sav
# Texpad
.texpadtmp
# LyX
*.lyx~
# Kile
*.backup
# gummi
.*.swp
# KBibTeX
*~[0-9]*
# TeXnicCenter
*.tps
# auto folder when using emacs and auctex
./auto/*
*.el
# expex forward references with \gathertags
*-tags.tex
# standalone packages
*.sta
# Makeindex log files
*.lpz
# xwatermark package
*.xwm
# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
# option is specified. Footnotes are then stored in a file with the suffix Notes.bib.
# Uncomment the next line to have this generated file ignored.
#*Notes.bib
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
/UWB-LOS-NLOS-Data-Set/

269
Project.ipynb Normal file

@@ -0,0 +1,269 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"# CSC 3105 Project"
],
"metadata": {
"collapsed": false
},
"id": "cda961ffb493d00c"
},
{
"cell_type": "code",
"outputs": [],
"source": [
"import os\n",
"\n",
"# Importing the libraries\n",
"import pandas as pd\n",
"\n",
"DATASET_DIR = './UWB-LOS-NLOS-Data-Set/dataset'"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-02-25T02:29:46.399745088Z",
"start_time": "2024-02-25T02:29:46.386566147Z"
}
},
"id": "bcd6cbaa5df10ce8",
"execution_count": 39
},
{
"cell_type": "markdown",
"source": [
"# Load the data into a pandas dataframe"
],
"metadata": {
"collapsed": false
},
"id": "bab890d7b05e347e"
},
{
"cell_type": "code",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" NLOS RANGE FP_IDX FP_AMP1 FP_AMP2 FP_AMP3 STDEV_NOISE CIR_PWR \\\n",
"0 1.0 6.18 749.0 4889.0 13876.0 10464.0 240.0 9048.0 \n",
"1 1.0 4.54 741.0 2474.0 2002.0 1593.0 68.0 6514.0 \n",
"2 1.0 4.39 744.0 1934.0 2615.0 4114.0 52.0 2880.0 \n",
"3 1.0 1.27 748.0 16031.0 17712.0 10420.0 64.0 12855.0 \n",
"4 0.0 1.16 743.0 20070.0 19886.0 15727.0 76.0 11607.0 \n",
"\n",
" MAX_NOISE RXPACC ... CIR1006 CIR1007 CIR1008 CIR1009 CIR1010 \\\n",
"0 3668.0 1024.0 ... 818.0 938.0 588.0 277.0 727.0 \n",
"1 1031.0 1024.0 ... 289.0 228.0 107.0 487.0 491.0 \n",
"2 796.0 1024.0 ... 123.0 281.0 483.0 97.0 272.0 \n",
"3 1529.0 323.0 ... 169.0 138.0 219.0 94.0 225.0 \n",
"4 2022.0 296.0 ... 87.0 43.0 358.0 308.0 132.0 \n",
"\n",
" CIR1011 CIR1012 CIR1013 CIR1014 CIR1015 \n",
"0 367.0 803.0 819.0 467.0 768.0 \n",
"1 404.0 334.0 210.0 102.0 0.0 \n",
"2 73.0 125.0 169.0 182.0 0.0 \n",
"3 155.0 172.0 278.0 318.0 0.0 \n",
"4 131.0 102.0 126.0 163.0 0.0 \n",
"\n",
"[5 rows x 1031 columns]\n",
"Index(['NLOS', 'RANGE', 'FP_IDX', 'FP_AMP1', 'FP_AMP2', 'FP_AMP3',\n",
" 'STDEV_NOISE', 'CIR_PWR', 'MAX_NOISE', 'RXPACC',\n",
" ...\n",
" 'CIR1006', 'CIR1007', 'CIR1008', 'CIR1009', 'CIR1010', 'CIR1011',\n",
" 'CIR1012', 'CIR1013', 'CIR1014', 'CIR1015'],\n",
" dtype='object', length=1031)\n",
"No missing values\n"
]
}
],
"source": [
"def load_data(dataset_dir):\n",
" # Get all file paths in the directory\n",
" file_paths = [os.path.join(dirpath, file) for dirpath, _, filenames in os.walk(dataset_dir) for file in filenames]\n",
"\n",
" # Load and concatenate all dataframes\n",
" data = pd.concat([pd.read_csv(file_path) for file_path in file_paths])\n",
"\n",
" return data\n",
"\n",
"\n",
"data = load_data(DATASET_DIR)\n",
"\n",
"print(data.head())\n",
"\n",
"# Print Headers\n",
"print(data.columns)\n",
"\n",
"# Check that there are no missing values\n",
"assert data.isnull().sum().sum() == 0\n",
"print(\"No missing values\")\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-02-25T02:29:51.084821398Z",
"start_time": "2024-02-25T02:29:46.405675293Z"
}
},
"id": "dd9657f5ec6d7754",
"execution_count": 40
},
{
"cell_type": "markdown",
"source": [
"The selected code is performing data standardization, which is a common preprocessing step in many machine learning workflows. \n",
"\n",
"The purpose of standardization is to transform the data such that it has a mean of 0 and a standard deviation of 1. This is done to ensure that all features have the same scale, which is a requirement for many machine learning algorithms.\n",
"\n",
"The mathematical formulas used in this process are as follows:\n",
"\n",
"1. Calculate the mean (μ) of the data:\n",
"\n",
"$$\n",
"\\mu = \\frac{1}{n} \\sum_{i=1}^{n} x_i\n",
"$$\n",
"Where:\n",
"- $n$ is the number of observations in the data\n",
"- $x_i$ is the value of the $i$-th observation\n",
"- $\\sum$ denotes the summation over all observations\n",
"\n",
"2. Standardize the data by subtracting the mean from each observation and dividing by the standard deviation:\n",
"\n",
"$$\n",
"\\text{Data}_i = \\frac{x_i - \\mu}{\\sigma}\n",
"$$\n",
"Where:\n",
"- $\\text{Data}_i$ is the standardized value of the $i$-th observation\n",
"- $\\sigma$ is the standard deviation of the data\n",
"- $x_i$ is the value of the $i$-th observation\n",
"- $\\mu$ is the mean of the data\n",
"\n",
"The `StandardScaler` class from the `sklearn.preprocessing` module is used to perform this standardization. The `fit_transform` method is used to calculate the mean and standard deviation of the data and then perform the standardization.\n",
"\n",
"**Note:** By setting the explained variance to 0.95, we are saying that we want to choose the smallest number of principal components such that 95% of the variance in the original data is retained. This means that the transformed data will retain 95% of the information of the original data, while potentially having fewer dimensions.\n"
],
"metadata": {
"collapsed": false
},
"id": "2c13064e20601717"
},
{
"cell_type": "code",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The number of principle components after PCA is 868\n"
]
}
],
"source": [
"from sklearn.decomposition import PCA\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"# Standardize the data\n",
"data_std = StandardScaler().fit_transform(data)\n",
"\n",
"# Initialize PCA with the desired explained variance\n",
"pca = PCA(0.95)\n",
"\n",
"# Fit PCA to your data\n",
"pca.fit(data_std)\n",
"\n",
"# Get the number of components\n",
"num_components = pca.n_components_\n",
"\n",
"print(f\"The number of principle components after PCA is {num_components}\")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-02-25T02:29:58.267018142Z",
"start_time": "2024-02-25T02:29:51.084440279Z"
}
},
"id": "7f9bec73a42f7bca",
"execution_count": 41
},
{
"cell_type": "markdown",
"source": [
"# Perform Dimensionality Reduction with PCA\n",
"\n",
"We can use the `transform` method of the `PCA` object to project the original data onto the principal components. This will give us the transformed data with the desired number of components."
],
"metadata": {
"collapsed": false
},
"id": "dc9f8c0e194dd07d"
},
{
"cell_type": "code",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Original number of components: 1031\n",
"Number of components after PCA: 868\n",
"PCA has successfully reduced the number of components.\n"
]
}
],
"source": [
"# Project original data to PC with the highest eigenvalue\n",
"data_pca = pca.transform(data_std)\n",
"\n",
"# Create a dataframe with the principal components\n",
"data_pca_df = pd.DataFrame(data_pca, columns=[f\"PC{i}\" for i in range(1, num_components + 1)])\n",
"\n",
"# Print the number of components in the original and PCA transformed data\n",
"print(f\"Original number of components: {data.shape[1]}\")\n",
"print(f\"Number of components after PCA: {num_components}\")\n",
"\n",
"# Compare the number of components in the original and PCA transformed data\n",
"if data.shape[1] > num_components:\n",
" print(\"PCA has successfully reduced the number of components.\")\n",
"elif data.shape[1] < num_components:\n",
" print(\"Unexpectedly, PCA has increased the number of components.\")\n",
"else:\n",
" print(\"The number of components remains unchanged after PCA.\")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-02-25T02:29:59.029369440Z",
"start_time": "2024-02-25T02:29:58.266576678Z"
}
},
"id": "96c62c50f8734a01",
"execution_count": 42
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

18
README.md Normal file

@@ -0,0 +1,18 @@
# CSC 3105 Project
This project involves data preprocessing and dimensionality reduction using Principal Component Analysis (PCA). The data is first standardized to have a mean of 0 and a standard deviation of 1. Then, PCA is applied to reduce the dimensionality of the data while retaining 95% of the original variance.
## File Structure
The project consists of the following files and directories:
- `Project.ipynb`: The main Jupyter notebook where all the data preprocessing and dimensionality reduction is performed.
- `./UWB-LOS-NLOS-Data-Set/dataset`: This directory contains the dataset used in the project.
## Installation and Setup
```bash
git clone <REPO_URL>
cd <REPO_NAME>
pip install -r requirements.txt
```
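## Usage
Open `Project.ipynb` in Jupyter and run the cells in order. The sketch below is a minimal illustration of the pipeline the notebook implements (standardization followed by PCA retaining 95% of the variance); it assumes `pandas` and `scikit-learn` are installed and that the UWB LOS/NLOS dataset CSVs sit under `./UWB-LOS-NLOS-Data-Set/dataset`.

```python
import os

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

DATASET_DIR = "./UWB-LOS-NLOS-Data-Set/dataset"

# Load and concatenate every CSV file found under the dataset directory
file_paths = [
    os.path.join(dirpath, name)
    for dirpath, _, filenames in os.walk(DATASET_DIR)
    for name in filenames
]
data = pd.concat(pd.read_csv(path) for path in file_paths)

# Standardize each feature to zero mean and unit variance
data_std = StandardScaler().fit_transform(data)

# Keep the smallest number of principal components that explains 95% of the variance
pca = PCA(n_components=0.95)
data_pca = pca.fit_transform(data_std)

print(f"Components retained: {pca.n_components_} of {data.shape[1]}")
```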