Initial commit

commit 905f3bc7b7

.gitignore
@@ -0,0 +1,477 @@
### venv template
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
.Python
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
.venv
pip-selfcheck.json

### TeX template
## Core latex/pdflatex auxiliary files:
*.aux
*.lof
*.log
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2
.*.lb

## Intermediate documents:
*.dvi
*.xdv
*-converted-to.*
# these rules might exclude image files for figures etc.
# *.ps
# *.eps
# *.pdf

## Generated if empty string is given at "Please type another file name for output:"
.pdf

## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
*.bcf
*.blg
*-blx.aux
*-blx.bib
*.run.xml

## Build tool auxiliary files:
*.fdb_latexmk
*.synctex
*.synctex(busy)
*.synctex.gz
*.synctex.gz(busy)
*.pdfsync

## Build tool directories for auxiliary files
# latexrun
latex.out/

## Auxiliary and intermediate files from other packages:
# algorithms
*.alg
*.loa

# achemso
acs-*.bib

# amsthm
*.thm

# beamer
*.nav
*.pre
*.snm
*.vrb

# changes
*.soc

# comment
*.cut

# cprotect
*.cpt

# elsarticle (documentclass of Elsevier journals)
*.spl

# endnotes
*.ent

# fixme
*.lox

# feynmf/feynmp
*.mf
*.mp
*.t[1-9]
*.t[1-9][0-9]
*.tfm

#(r)(e)ledmac/(r)(e)ledpar
*.end
*.?end
*.[1-9]
*.[1-9][0-9]
*.[1-9][0-9][0-9]
*.[1-9]R
*.[1-9][0-9]R
*.[1-9][0-9][0-9]R
*.eledsec[1-9]
*.eledsec[1-9]R
*.eledsec[1-9][0-9]
*.eledsec[1-9][0-9]R
*.eledsec[1-9][0-9][0-9]
*.eledsec[1-9][0-9][0-9]R

# glossaries
*.acn
*.acr
*.glg
*.glo
*.gls
*.glsdefs
*.lzo
*.lzs
*.slg
*.slo
*.sls

# uncomment this for glossaries-extra (will ignore makeindex's style files!)
# *.ist

# gnuplot
*.gnuplot
*.table

# gnuplottex
*-gnuplottex-*

# gregoriotex
*.gaux
*.glog
*.gtex

# htlatex
*.4ct
*.4tc
*.idv
*.lg
*.trc
*.xref

# hyperref
*.brf

# knitr
*-concordance.tex
# TODO Uncomment the next line if you use knitr and want to ignore its generated tikz files
# *.tikz
*-tikzDictionary

# listings
*.lol

# luatexja-ruby
*.ltjruby

# makeidx
*.idx
*.ilg
*.ind

# minitoc
*.maf
*.mlf
*.mlt
*.mtc[0-9]*
*.slf[0-9]*
*.slt[0-9]*
*.stc[0-9]*

# minted
_minted*
*.pyg

# morewrites
*.mw

# newpax
*.newpax

# nomencl
*.nlg
*.nlo
*.nls

# pax
*.pax

# pdfpcnotes
*.pdfpc

# sagetex
*.sagetex.sage
*.sagetex.py
*.sagetex.scmd

# scrwfile
*.wrt

# svg
svg-inkscape/

# sympy
*.sout
*.sympy
sympy-plots-for-*.tex/

# pdfcomment
*.upa
*.upb

# pythontex
*.pytxcode
pythontex-files-*/

# tcolorbox
*.listing

# thmtools
*.loe

# TikZ & PGF
*.dpth
*.md5
*.auxlock

# titletoc
*.ptc

# todonotes
*.tdo

# vhistory
*.hst
*.ver

# easy-todo
*.lod

# xcolor
*.xcp

# xmpincl
*.xmpi

# xindy
*.xdy

# xypic precompiled matrices and outlines
*.xyc
*.xyd

# endfloat
*.ttt
*.fff

# Latexian
TSWLatexianTemp*

## Editors:
# WinEdt
*.bak
*.sav

# Texpad
.texpadtmp

# LyX
*.lyx~

# Kile
*.backup

# gummi
.*.swp

# KBibTeX
*~[0-9]*

# TeXnicCenter
*.tps

# auto folder when using emacs and auctex
./auto/*
*.el

# expex forward references with \gathertags
*-tags.tex

# standalone packages
*.sta

# Makeindex log files
*.lpz

# xwatermark package
*.xwm

# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
# option is specified. Footnotes are then stored in a file with suffix Notes.bib.
# Uncomment the next line to have this generated file ignored.
#*Notes.bib

### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

/UWB-LOS-NLOS-Data-Set/

Project.ipynb
@@ -0,0 +1,269 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"# CSC 3105 Project"
],
"metadata": {
"collapsed": false
},
"id": "cda961ffb493d00c"
},
{
"cell_type": "code",
"outputs": [],
"source": [
"import os\n",
"\n",
"# Importing the libraries\n",
"import pandas as pd\n",
"\n",
"DATASET_DIR = './UWB-LOS-NLOS-Data-Set/dataset'"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-02-25T02:29:46.399745088Z",
"start_time": "2024-02-25T02:29:46.386566147Z"
}
},
"id": "bcd6cbaa5df10ce8",
"execution_count": 39
},
{
"cell_type": "markdown",
"source": [
"# Load the data into a pandas dataframe"
],
"metadata": {
"collapsed": false
},
"id": "bab890d7b05e347e"
},
{
"cell_type": "code",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"   NLOS  RANGE  FP_IDX  FP_AMP1  FP_AMP2  FP_AMP3  STDEV_NOISE  CIR_PWR \\\n",
"0  1.0  6.18  749.0  4889.0  13876.0  10464.0  240.0  9048.0 \n",
"1  1.0  4.54  741.0  2474.0  2002.0  1593.0  68.0  6514.0 \n",
"2  1.0  4.39  744.0  1934.0  2615.0  4114.0  52.0  2880.0 \n",
"3  1.0  1.27  748.0  16031.0  17712.0  10420.0  64.0  12855.0 \n",
"4  0.0  1.16  743.0  20070.0  19886.0  15727.0  76.0  11607.0 \n",
"\n",
"   MAX_NOISE  RXPACC  ...  CIR1006  CIR1007  CIR1008  CIR1009  CIR1010 \\\n",
"0  3668.0  1024.0  ...  818.0  938.0  588.0  277.0  727.0 \n",
"1  1031.0  1024.0  ...  289.0  228.0  107.0  487.0  491.0 \n",
"2  796.0  1024.0  ...  123.0  281.0  483.0  97.0  272.0 \n",
"3  1529.0  323.0  ...  169.0  138.0  219.0  94.0  225.0 \n",
"4  2022.0  296.0  ...  87.0  43.0  358.0  308.0  132.0 \n",
"\n",
"   CIR1011  CIR1012  CIR1013  CIR1014  CIR1015 \n",
"0  367.0  803.0  819.0  467.0  768.0 \n",
"1  404.0  334.0  210.0  102.0  0.0 \n",
"2  73.0  125.0  169.0  182.0  0.0 \n",
"3  155.0  172.0  278.0  318.0  0.0 \n",
"4  131.0  102.0  126.0  163.0  0.0 \n",
"\n",
"[5 rows x 1031 columns]\n",
"Index(['NLOS', 'RANGE', 'FP_IDX', 'FP_AMP1', 'FP_AMP2', 'FP_AMP3',\n",
"       'STDEV_NOISE', 'CIR_PWR', 'MAX_NOISE', 'RXPACC',\n",
"       ...\n",
"       'CIR1006', 'CIR1007', 'CIR1008', 'CIR1009', 'CIR1010', 'CIR1011',\n",
"       'CIR1012', 'CIR1013', 'CIR1014', 'CIR1015'],\n",
"      dtype='object', length=1031)\n",
"No missing values\n"
]
}
],
"source": [
"def load_data(dataset_dir):\n",
"    # Get all file paths in the directory\n",
"    file_paths = [os.path.join(dirpath, file) for dirpath, _, filenames in os.walk(dataset_dir) for file in filenames]\n",
"\n",
"    # Load and concatenate all dataframes\n",
"    data = pd.concat([pd.read_csv(file_path) for file_path in file_paths])\n",
"\n",
"    return data\n",
"\n",
"\n",
"data = load_data(DATASET_DIR)\n",
"\n",
"print(data.head())\n",
"\n",
"# Print Headers\n",
"print(data.columns)\n",
"\n",
"# Check that there are no missing values\n",
"assert data.isnull().sum().sum() == 0\n",
"print(\"No missing values\")\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-02-25T02:29:51.084821398Z",
"start_time": "2024-02-25T02:29:46.405675293Z"
}
},
"id": "dd9657f5ec6d7754",
"execution_count": 40
},
{
"cell_type": "markdown",
"source": [
"The selected code is performing data standardization, which is a common preprocessing step in many machine learning workflows. \n",
|
||||||
|
"\n",
|
||||||
|
"The purpose of standardization is to transform the data such that it has a mean of 0 and a standard deviation of 1. This is done to ensure that all features have the same scale, which is a requirement for many machine learning algorithms.\n",
|
||||||
|
"\n",
|
||||||
|
"The mathematical formulas used in this process are as follows:\n",
|
||||||
|
"\n",
|
||||||
|
"1. Calculate the mean (μ) of the data:\n",
|
||||||
|
"\n",
|
||||||
|
"$$\n",
|
||||||
|
"\\mu = \\frac{1}{n} \\sum_{i=1}^{n} x_i\n",
|
||||||
|
"$$\n",
|
||||||
|
"Where:\n",
|
||||||
|
"- $n$ is the number of observations in the data\n",
|
||||||
|
"- $x_i$ is the value of the $i$-th observation\n",
|
||||||
|
"- $\\sum$ denotes the summation over all observations\n",
|
||||||
|
"\n",
|
||||||
|
"2. Standardize the data by subtracting the mean from each observation and dividing by the standard deviation:\n",
|
||||||
|
"\n",
|
||||||
|
"$$\n",
|
||||||
|
"\\text{Data}_i = \\frac{x_i - \\mu}{\\sigma}\n",
|
||||||
|
"$$\n",
|
||||||
|
"Where:\n",
|
||||||
|
"- $\\text{Data}_i$ is the standardized value of the $i$-th observation\n",
|
||||||
|
"- $\\sigma$ is the standard deviation of the data\n",
|
||||||
|
"- $x_i$ is the value of the $i$-th observation\n",
|
||||||
|
"- $\\mu$ is the mean of the data\n",
|
||||||
|
"\n",
|
||||||
|
"The `StandardScaler` class from the `sklearn.preprocessing` module is used to perform this standardization. The `fit_transform` method is used to calculate the mean and standard deviation of the data and then perform the standardization.\n",
|
||||||
|
"\n",
|
||||||
|
"**Note:** By setting the explained variance to 0.95, we are saying that we want to choose the smallest number of principal components such that 95% of the variance in the original data is retained. This means that the transformed data will retain 95% of the information of the original data, while potentially having fewer dimensions.\n"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"id": "2c13064e20601717"
|
||||||
|
},
|
||||||
|
{
"cell_type": "code",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The number of principal components after PCA is 868\n"
]
}
],
"source": [
"from sklearn.decomposition import PCA\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"# Standardize the data\n",
"data_std = StandardScaler().fit_transform(data)\n",
"\n",
"# Initialize PCA with the desired explained variance\n",
"pca = PCA(0.95)\n",
"\n",
"# Fit PCA to the standardized data\n",
"pca.fit(data_std)\n",
"\n",
"# Get the number of components\n",
"num_components = pca.n_components_\n",
"\n",
"print(f\"The number of principal components after PCA is {num_components}\")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-02-25T02:29:58.267018142Z",
"start_time": "2024-02-25T02:29:51.084440279Z"
}
},
"id": "7f9bec73a42f7bca",
"execution_count": 41
},
{
"cell_type": "markdown",
"source": [
"# Perform Dimensionality Reduction with PCA\n",
"\n",
"We can use the `transform` method of the `PCA` object to project the original data onto the principal components. This gives us the transformed data with the desired number of components."
],
"metadata": {
"collapsed": false
},
"id": "dc9f8c0e194dd07d"
},
{
"cell_type": "code",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Original number of components: 1031\n",
"Number of components after PCA: 868\n",
"PCA has successfully reduced the number of components.\n"
]
}
],
"source": [
"# Project the original data onto the retained principal components\n",
"data_pca = pca.transform(data_std)\n",
"\n",
"# Create a dataframe with the principal components\n",
"data_pca_df = pd.DataFrame(data_pca, columns=[f\"PC{i}\" for i in range(1, num_components + 1)])\n",
"\n",
"# Print the number of components in the original and PCA-transformed data\n",
"print(f\"Original number of components: {data.shape[1]}\")\n",
"print(f\"Number of components after PCA: {num_components}\")\n",
"\n",
"# Compare the number of components in the original and PCA-transformed data\n",
"if data.shape[1] > num_components:\n",
"    print(\"PCA has successfully reduced the number of components.\")\n",
"elif data.shape[1] < num_components:\n",
"    print(\"Unexpectedly, PCA has increased the number of components.\")\n",
"else:\n",
"    print(\"The number of components remains unchanged after PCA.\")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-02-25T02:29:59.029369440Z",
"start_time": "2024-02-25T02:29:58.266576678Z"
}
},
"id": "96c62c50f8734a01",
"execution_count": 42
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

README.md
@@ -0,0 +1,18 @@
# CSC 3105 Project

This project involves data preprocessing and dimensionality reduction using Principal Component Analysis (PCA). The data is first standardized to have a mean of 0 and a standard deviation of 1. Then, PCA is applied to reduce the dimensionality of the data while retaining 95% of the original variance. A condensed sketch of this pipeline is shown below.
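
For orientation, here is a condensed, illustrative sketch of the pipeline, pieced together from the notebook cells in this commit (it summarizes `Project.ipynb` rather than replacing it; the dataset path matches the repository layout described below):

```python
import os

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

DATASET_DIR = './UWB-LOS-NLOS-Data-Set/dataset'

# Gather every file under the dataset directory and concatenate the CSVs
file_paths = [os.path.join(dirpath, file)
              for dirpath, _, filenames in os.walk(DATASET_DIR)
              for file in filenames]
data = pd.concat([pd.read_csv(path) for path in file_paths])

# Standardize each feature to mean 0 and standard deviation 1
data_std = StandardScaler().fit_transform(data)

# Keep the smallest number of principal components that retains 95% of the variance
pca = PCA(0.95)
data_pca = pca.fit_transform(data_std)

print(f"Reduced {data.shape[1]} features to {pca.n_components_} principal components")
```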

## File Structure

The project consists of the following files and directories:

- `Project.ipynb`: The main Jupyter notebook, where all the data loading, preprocessing, and PCA analysis is done.
- `./UWB-LOS-NLOS-Data-Set/dataset`: This directory contains the dataset used in the project.

## Installation and Setup

```bash
git clone <REPO_URL>
cd <REPO_NAME>
pip install -r requirements.txt
```
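
After installing the dependencies, the analysis can be reproduced by opening the notebook (this assumes Jupyter itself is available in your environment, e.g. via `requirements.txt`):

```bash
jupyter notebook Project.ipynb
```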