initial commit of files

tulimid1 · May 14, 2022 · 982eae7 · 982eae7
1 parent b617ae1
commit 982eae7
Show file tree

Hide file tree

Showing 54 changed files with 2,024 additions and 0 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/Code/.DS_Store b/Code/.DS_Store
diff --git a/Code/Blackard_Dean_99.ipynb b/Code/Blackard_Dean_99.ipynb
diff --git a/Code/CoverTypeProject.yml b/Code/CoverTypeProject.yml
@@ -0,0 +1,161 @@
+name: math637
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - appnope=0.1.2=py38h50d1736_2
+  - argon2-cffi=20.1.0=py38h9ed2024_1
+  - asttokens=2.0.5=pyhd8ed1ab_0
+  - attrs=21.4.0=pyhd3eb1b0_0
+  - backcall=0.2.0=pyh9f0ad1d_0
+  - backports=1.0=py_2
+  - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0
+  - black=22.1.0=pyhd8ed1ab_0
+  - blas=1.0=mkl
+  - bleach=4.1.0=pyhd3eb1b0_0
+  - bottleneck=1.3.2=py38hf1fa96c_1
+  - brotli=1.0.9=hb1e8313_2
+  - brotlipy=0.7.0=py38h96a0964_1003
+  - ca-certificates=2022.3.29=hecd8cb5_1
+  - certifi=2021.10.8=py38hecd8cb5_2
+  - cffi=1.15.0=py38hc55c11b_1
+  - charset-normalizer=2.0.11=pyhd8ed1ab_0
+  - click=8.0.3=py38h50d1736_1
+  - cryptography=36.0.1=py38h56c4533_0
+  - cycler=0.11.0=pyhd3eb1b0_0
+  - dataclasses=0.8=pyhc8e2a94_3
+  - debugpy=1.5.1=py38ha048514_0
+  - decorator=5.1.1=pyhd8ed1ab_0
+  - defusedxml=0.7.1=pyhd3eb1b0_0
+  - docopt=0.6.2=py_1
+  - entrypoints=0.4=pyhd8ed1ab_0
+  - executing=0.8.2=pyhd8ed1ab_0
+  - fonttools=4.25.0=pyhd3eb1b0_0
+  - freetype=2.11.0=hd8bbffd_0
+  - giflib=5.2.1=haf1e3a3_0
+  - greenlet=1.1.1=py38h23ab428_0
+  - idna=3.3=pyhd8ed1ab_0
+  - importlib-metadata=4.8.2=py38hecd8cb5_0
+  - importlib_metadata=4.8.2=hd3eb1b0_0
+  - intel-openmp=2021.4.0=hecd8cb5_3538
+  - ipykernel=6.9.0=py38h5fd9f69_0
+  - ipympl=0.8.7=pyhd3eb1b0_0
+  - ipython=8.0.1=py38h50d1736_0
+  - ipython_genutils=0.2.0=pyhd3eb1b0_1
+  - ipywidgets=7.6.5=pyhd3eb1b0_1
+  - jedi=0.18.1=py38h50d1736_0
+  - jinja2=3.0.2=pyhd3eb1b0_0
+  - jpeg=9d=h9ed2024_0
+  - jsonschema=3.2.0=pyhd3eb1b0_2
+  - jupyter_client=7.1.2=pyhd8ed1ab_0
+  - jupyter_core=4.9.1=py38h50d1736_1
+  - jupyterlab_pygments=0.1.2=py_0
+  - jupyterlab_widgets=1.0.0=pyhd3eb1b0_1
+  - kiwisolver=1.3.1=py38h23ab428_0
+  - lcms2=2.12=hf1fd2bf_0
+  - libcxx=12.0.0=h2f01273_0
+  - libffi=3.3=hb1e8313_2
+  - libgfortran=3.0.1=h93005f0_2
+  - libllvm11=11.1.0=h9b2ccf5_0
+  - libpng=1.6.37=ha441bb4_0
+  - libsodium=1.0.18=hbcb3906_1
+  - libtiff=4.2.0=h87d7836_0
+  - libwebp=1.2.0=hacca55c_0
+  - libwebp-base=1.2.0=h9ed2024_0
+  - llvm-openmp=12.0.0=h0dcd299_1
+  - llvmlite=0.37.0=py38he4411ff_1
+  - lz4-c=1.9.3=h23ab428_1
+  - markupsafe=2.0.1=py38h9ed2024_0
+  - matplotlib=3.5.0=py38hecd8cb5_0
+  - matplotlib-base=3.5.0=py38h4f681db_0
+  - matplotlib-inline=0.1.3=pyhd8ed1ab_0
+  - mistune=0.8.4=py38h1de35cc_1001
+  - mkl=2021.4.0=hecd8cb5_637
+  - mkl-service=2.4.0=py38h9ed2024_0
+  - mkl_fft=1.3.1=py38h4ab4a9b_0
+  - mkl_random=1.2.2=py38hb2f4e1b_0
+  - munkres=1.1.4=py_0
+  - mypy_extensions=0.4.3=py38h50d1736_4
+  - nbclient=0.5.11=pyhd3eb1b0_0
+  - nbconvert=6.3.0=py38hecd8cb5_0
+  - nbformat=5.1.3=pyhd3eb1b0_0
+  - ncurses=6.3=hca72f7f_2
+  - nest-asyncio=1.5.4=pyhd8ed1ab_0
+  - notebook=6.4.8=py38hecd8cb5_0
+  - numba=0.54.1=py38hae1ba45_0
+  - numexpr=2.8.1=py38h2e5f0a9_0
+  - numpy=1.20.3=py38h4b4dc7a_0
+  - numpy-base=1.20.3=py38he0bd621_0
+  - olefile=0.46=pyhd3eb1b0_0
+  - openssl=1.1.1n=hca72f7f_0
+  - packaging=21.3=pyhd3eb1b0_0
+  - pandas=1.3.5=py38h743cdd8_0
+  - pandocfilters=1.5.0=pyhd3eb1b0_0
+  - parso=0.8.3=pyhd8ed1ab_0
+  - pathspec=0.9.0=pyhd8ed1ab_0
+  - patsy=0.5.2=py38hecd8cb5_1
+  - pexpect=4.8.0=pyh9f0ad1d_2
+  - pickleshare=0.7.5=py_1003
+  - pillow=8.4.0=py38h98e4679_0
+  - pip=21.2.4=py38hecd8cb5_0
+  - pipreqs=0.4.10=py_0
+  - platformdirs=2.5.0=pyhd8ed1ab_0
+  - plotly=5.6.0=pyhd3eb1b0_0
+  - prometheus_client=0.13.1=pyhd3eb1b0_0
+  - prompt-toolkit=3.0.27=pyha770c72_0
+  - ptyprocess=0.7.0=pyhd3deb0d_0
+  - pure_eval=0.2.2=pyhd8ed1ab_0
+  - pycparser=2.21=pyhd8ed1ab_0
+  - pygments=2.11.2=pyhd8ed1ab_0
+  - pyopenssl=22.0.0=pyhd8ed1ab_0
+  - pyparsing=3.0.4=pyhd3eb1b0_0
+  - pyrsistent=0.18.0=py38hca72f7f_0
+  - pysocks=1.7.1=py38h50d1736_4
+  - python=3.8.10=h88f2d9e_7
+  - python-dateutil=2.8.2=pyhd8ed1ab_0
+  - python_abi=3.8=2_cp38
+  - pyzmq=22.3.0=py38hd3b92b6_1
+  - readline=8.1.2=hca72f7f_1
+  - requests=2.27.1=pyhd8ed1ab_0
+  - scipy=1.7.3=py38h8c7af03_0
+  - seaborn=0.11.2=pyhd3eb1b0_0
+  - send2trash=1.8.0=pyhd3eb1b0_1
+  - setuptools=58.0.4=py38hecd8cb5_0
+  - six=1.16.0=pyh6c4a22f_0
+  - sqlalchemy=1.4.32=py38hca72f7f_0
+  - sqlite=3.37.2=h707629a_0
+  - stack_data=0.1.4=pyhd8ed1ab_0
+  - statsmodels=0.12.2=py38h9ed2024_0
+  - tbb=2021.5.0=haf03e11_0
+  - tenacity=8.0.1=py38hecd8cb5_0
+  - terminado=0.13.1=py38hecd8cb5_0
+  - testpath=0.5.0=pyhd3eb1b0_0
+  - tk=8.6.11=h7bc2e8c_0
+  - tomli=2.0.1=pyhd8ed1ab_0
+  - tornado=6.1=py38h96a0964_2
+  - traitlets=5.1.1=pyhd8ed1ab_0
+  - typed-ast=1.5.2=py38h96a0964_0
+  - typing_extensions=4.1.1=pyha770c72_0
+  - urllib3=1.26.8=pyhd8ed1ab_1
+  - wcwidth=0.2.5=pyh9f0ad1d_2
+  - webencodings=0.5.1=py38_1
+  - wheel=0.37.1=pyhd3eb1b0_0
+  - widgetsnbextension=3.5.2=py38hecd8cb5_0
+  - xz=5.2.5=h1de35cc_0
+  - yarg=0.1.9=py_1
+  - zeromq=4.3.4=he49afe7_1
+  - zipp=3.7.0=pyhd3eb1b0_0
+  - zlib=1.2.11=h4dc903c_4
+  - zstd=1.4.9=h322a384_0
+  - pip:
+    - datetime==4.4
+    - imageio==2.16.0
+    - joblib==1.1.0
+    - pytz==2021.3
+    - savingfigr==1.0.3
+    - savingfigures==1.0.3
+    - scikit-learn==1.0.2
+    - sklearn==0.0
+    - threadpoolctl==3.1.0
+    - zope-interface==5.4.0
+prefix: /Users/Semrau_Lab/opt/anaconda3/envs/math637
diff --git a/Code/KNN_param_search.ipynb b/Code/KNN_param_search.ipynb
diff --git a/Code/LDA_param_search.ipynb b/Code/LDA_param_search.ipynb
diff --git a/Code/LogisticReg_param_search.ipynb b/Code/LogisticReg_param_search.ipynb
@@ -0,0 +1,124 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Logistic Regression parameter search\n",
+    "\n",
+    "Duncan Tulimieri"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import libraries \n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "plt.rcParams.update({'font.size': 16}) \n",
+    "from sklearn.model_selection import GridSearchCV\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "import warnings \n",
+    "warnings.filterwarnings('ignore')\n",
+    "import time \n",
+    "import seaborn as sns\n",
+    "# personal classes\n",
+    "from ProcessData import ProcessForestData\n",
+    "import savingfigR as sf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class pLogisticRegression(ProcessForestData):\n",
+    "    '''Grid-search, fit and score a scikit-learn LogisticRegression.\n",
+    "\n",
+    "    The class-level *_options attributes are the default hyperparameter grids\n",
+    "    used by optimize_LogisticRegression_params.\n",
+    "    NOTE(review): load_data is not defined here; presumably it is inherited\n",
+    "    from ProcessForestData -- confirm against ProcessData.py.\n",
+    "    '''\n",
+    "    \n",
+    "    penalty_options = ['l1', 'l2', 'elasticnet', 'none']\n",
+    "    C_options = np.linspace(0.01, 1, 5)\n",
+    "    intercept_options = [True, False]\n",
+    "    l1_ratio_options = np.linspace(0, 1, 5)\n",
+    "    \n",
+    "    def __init__(self):\n",
+    "        '''Load the train/test split, run the parameter search, then print the test score and elapsed seconds.'''\n",
+    "        # method calls \n",
+    "        self.X_train, self.X_test, self.y_train, self.y_test = self.load_data(perform_scale=True, sub_data_section='')\n",
+    "        self.un_classifiers = np.unique(self.y_train)\n",
+    "        # Raw data \n",
+    "        start = time.time()\n",
+    "        LR_trained_opt = self.optimize_LogisticRegression_params(self.X_train, self.y_train, self.penalty_options, self.C_options, self.intercept_options, self.l1_ratio_options)\n",
+    "        LR_score = self.score_LogisticRegression(LR_trained_opt, self.X_test, self.y_test)\n",
+    "        end = time.time()\n",
+    "        print(f'Raw data LogisticRegression optimal score = {LR_score}')\n",
+    "        print(f'Time taken = {end-start}')\n",
+    "\n",
+    "    # model\n",
+    "    def train_LogisticRegression(self, X, y, penalty, C, fit_B0, l1_ratio):\n",
+    "        '''Fit and return a saga-solver LogisticRegression on (X, y).\n",
+    "\n",
+    "        fit_B0 is passed as fit_intercept; l1_ratio is forwarded only when\n",
+    "        penalty is 'elasticnet' and is ignored otherwise.\n",
+    "        '''\n",
+    "        if penalty == 'elasticnet':\n",
+    "            return LogisticRegression(penalty=penalty, C=C, fit_intercept=fit_B0, l1_ratio=l1_ratio, n_jobs=4, solver='saga').fit(X, y)\n",
+    "        else: \n",
+    "            return LogisticRegression(penalty=penalty, C=C, fit_intercept=fit_B0, n_jobs=4, solver='saga').fit(X, y)\n",
+    "\n",
+    "    def score_LogisticRegression(self, trained_LogisticRegression_model, X_test, y_test):\n",
+    "        '''Return the fitted model's score(X_test, y_test).'''\n",
+    "        return trained_LogisticRegression_model.score(X_test, y_test)\n",
+    "\n",
+    "    def predict_LogisticRegression(self, trained_LogisticRegression_model, X_test):\n",
+    "        '''Return the fitted model's predictions for X_test.'''\n",
+    "        return trained_LogisticRegression_model.predict(X_test)\n",
+    "\n",
+    "    def optimize_LogisticRegression_params(self, X_train, y_train, penalty_options=penalty_options, C_options=C_options, intercept_options=intercept_options, l1_ratio_options=l1_ratio_options, cv=10, scoring='accuracy'):\n",
+    "        '''Cross-validate the hyperparameter grid and return a model refit with the best combination.\n",
+    "\n",
+    "        Prints the best parameters. The reported l1_ratio is None when the best\n",
+    "        penalty is not 'elasticnet', because l1_ratio is only searched for that penalty.\n",
+    "        '''\n",
+    "        # Search with the same solver that train_LogisticRegression uses. The default\n",
+    "        # lbfgs solver rejects the l1 and elasticnet penalties, so those candidates\n",
+    "        # would fail to fit and be scored as NaN instead of being evaluated.\n",
+    "        LogisticRegression_raw = LogisticRegression(solver='saga')\n",
+    "        shared_grid = {'C': C_options, 'fit_intercept': intercept_options}\n",
+    "        # l1_ratio only affects elasticnet; crossing it with the other penalties\n",
+    "        # just repeats identical fits, so it gets its own sub-grid.\n",
+    "        plain_penalties = [p for p in penalty_options if p != 'elasticnet']\n",
+    "        param_grid = []\n",
+    "        if plain_penalties:\n",
+    "            param_grid.append({'penalty': plain_penalties, **shared_grid})\n",
+    "        if len(plain_penalties) != len(penalty_options):\n",
+    "            param_grid.append({'penalty': ['elasticnet'], 'l1_ratio': l1_ratio_options, **shared_grid})\n",
+    "        cv_train_model = GridSearchCV(LogisticRegression_raw, param_grid=param_grid, cv=cv, scoring=scoring).fit(X_train, y_train)\n",
+    "        best = cv_train_model.best_params_\n",
+    "        best_penalty = best['penalty']\n",
+    "        best_C = best['C']\n",
+    "        best_fit_intercept = best['fit_intercept']\n",
+    "        # absent from best_params_ unless the winning penalty is elasticnet\n",
+    "        best_l1_ratio = best.get('l1_ratio')\n",
+    "        print(f'Best LogisticRegression parameters: penalty = {best_penalty}, C = {best_C}, fit_intercept = {best_fit_intercept}, l1_ratio = {best_l1_ratio}')\n",
+    "        best_model = self.train_LogisticRegression(X_train, y_train, penalty=best_penalty, C=best_C, fit_B0=best_fit_intercept, l1_ratio=best_l1_ratio)\n",
+    "        return best_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Best LogisticRegression parameters: penalty = l2, C = 0.7525, fit_intercept = True, l1_ratio = 0.0\n",
+      "Raw data LogisticRegression optimal score = 0.7166304078429213\n",
+      "Time taken = 14484.176457881927\n"
+     ]
+    }
+   ],
+   "source": [
+    "testLR = pLogisticRegression()"
+   ]
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "e5ced7bbea2155d302b976f4184419b8d40f50030e781605408c0dc76f430f24"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.8.10 ('math637')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}