# Columns selected by getEnrollmentdf: study attributes plus ENROLLMENT.
_ENROLLMENT_FEATURES = [
    'PHASE', 'STUDY_TYPE', 'NUMBER_OF_ARMS', 'GENDER', 'CONDITION',
    'INTERVENTION_TYPE', 'ENDPOINT_CLASSIFICATION', 'INTERVENTION_MODEL',
    'OBSERVATION_MODEL', 'MASKING', 'TIME_PERSPECTIVE', 'PRIMARY_PURPOSE',
    'ENROLLMENT',
]

# Subset of _ENROLLMENT_FEATURES that getEnrollmentdf one-hot encodes.
# NUMBER_OF_ARMS, CONDITION and ENROLLMENT are passed through untouched.
_ENROLLMENT_ONEHOT_COLUMNS = [
    'PHASE', 'STUDY_TYPE', 'GENDER', 'INTERVENTION_TYPE',
    'ENDPOINT_CLASSIFICATION', 'INTERVENTION_MODEL', 'OBSERVATION_MODEL',
    'MASKING', 'TIME_PERSPECTIVE', 'PRIMARY_PURPOSE',
]

# (accepted unit suffixes, conversion from a whole number of units to weeks).
# The 4.5 weeks-per-month factor is kept from the original implementation.
_AGE_TO_WEEKS = (
    (('Years', 'Year'), lambda n: 52 * n),
    (('Months', 'Month'), lambda n: 4.5 * n),
    (('Weeks', 'Week'), lambda n: n),
    (('Days', 'Day'), lambda n: n / 7),
    (('Hours', 'Hour'), lambda n: n / (7 * 24)),
    (('Minutes', 'Minute'), lambda n: n / (7 * 24 * 60)),
)

# Inclusive upper bounds of significance buckets 1..8; anything above the
# last bound lands in bucket 9.
_P_VALUE_UPPER_BOUNDS = (.02, .03, .04, .05, .1, .3, .5, .7)


def encode_onehot(df, cols):
    """Return a copy of ``df`` with ``cols`` replaced by one-hot columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols : list of str
        Names of the columns to encode with ``DictVectorizer``.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``cols``, joined (on the index) with the vectorized
        columns produced from them.
    """
    vec = DictVectorizer()
    encoded = vec.fit_transform(df[cols].to_dict(orient='records')).toarray()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back to the old name on older releases.
    if hasattr(vec, 'get_feature_names_out'):
        feature_names = vec.get_feature_names_out()
    else:
        feature_names = vec.get_feature_names()

    vec_data = pd.DataFrame(encoded, columns=feature_names, index=df.index)
    return df.drop(cols, axis=1).join(vec_data)


def encode_label(df, cols, path='C:/ct_gov_condition_label.csv'):
    """Label-encode ``cols`` and write the encoded values to a CSV file.

    Each column is encoded with its own ``LabelEncoder`` (a single encoder
    only accepts 1-D input). The encoded frame is written to ``path``;
    ``df`` itself is returned unchanged, as in the original implementation.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols : str or list of str
        Column name(s) to encode.
    path : str, optional
        Destination of the CSV file. Defaults to the location that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The input ``df``, unmodified.
    """
    columns = [cols] if isinstance(cols, str) else list(cols)

    le_data = pd.DataFrame(
        {col: preprocessing.LabelEncoder().fit_transform(df[col]) for col in columns},
        columns=columns,
        index=df.index,
    )
    le_data.to_csv(path_or_buf=path)

    return df


def getAgeInWeeks(x):
    """Convert an age string such as ``'18 Years'`` to a number of weeks.

    Parameters
    ----------
    x : str or None/NaN or number
        Text made of an integer followed by a unit (Year(s), Month(s),
        Week(s), Day(s), Hour(s), Minute(s)).

    Returns
    -------
    int, float or object
        ``0`` for null input; the age in weeks for recognised text; ``x``
        itself when it is not a string or carries no recognised unit
        (for example ``'N/A'``).
    """
    if pd.isnull(x):
        return 0
    if not isinstance(x, str):
        # Already numeric (or otherwise not text): nothing to parse.
        return x

    text = x.strip()
    for suffixes, to_weeks in _AGE_TO_WEEKS:
        for suffix in suffixes:
            if text.endswith(suffix):
                # int() tolerates the blank between the number and the unit.
                return to_weeks(int(text[:-len(suffix)]))

    return x


def getSignificance(x):
    """Bucket a p-value into an ordinal significance code.

    Parameters
    ----------
    x : number, numeric str or None/NaN
        The p-value.

    Returns
    -------
    int or float
        ``0`` for null input; ``1``..``9`` for positive values, using the
        inclusive upper bounds .02, .03, .04, .05, .1, .3, .5, .7 (values
        above .7 map to ``9``); the value itself as a float when it is
        zero or negative.
    """
    # The null check must come first: float(None) raises TypeError.
    if pd.isnull(x):
        return 0

    value = float(x)
    if pd.isnull(value):
        # Text such as 'nan' only becomes NaN after conversion.
        return 0
    if value <= 0:
        return value

    for bucket, upper in enumerate(_P_VALUE_UPPER_BOUNDS, start=1):
        if value <= upper:
            return bucket
    return len(_P_VALUE_UPPER_BOUNDS) + 1


def getNoDefined(x):
    """Return ``'NotDefined'`` for null input, otherwise ``x`` unchanged."""
    return 'NotDefined' if pd.isnull(x) else x


def getZero(x):
    """Return ``0`` for null input, otherwise ``x`` unchanged."""
    return 0 if pd.isnull(x) else x


def getEnrollmentdf(df):
    """Build the feature frame of study attributes plus ENROLLMENT.

    Selects the columns in ``_ENROLLMENT_FEATURES``, one-hot encodes the
    columns in ``_ENROLLMENT_ONEHOT_COLUMNS`` and replaces remaining nulls
    with 0. CONDITION is kept as-is here; it is not one-hot encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns in ``_ENROLLMENT_FEATURES``.

    Returns
    -------
    pandas.DataFrame
        The encoded feature frame, indexed like ``df``.
    """
    selected = df[_ENROLLMENT_FEATURES]
    return encode_onehot(selected, _ENROLLMENT_ONEHOT_COLUMNS).fillna(0)
\n", " | OVERALL_STATUS | \n", "COMPLETION_DATE_TYPE | \n", "PHASE | \n", "STUDY_TYPE | \n", "NUMBER_OF_ARMS | \n", "NUMBER_OF_GROUPS | \n", "ENROLLMENT_TYPE | \n", "ENROLLMENT | \n", "GENDER | \n", "MINIMUM_AGE | \n", "... | \n", "NUMBER_OF_INVESTIGATORS | \n", "PARTFLOW_COUNT_STARTED | \n", "PARTFLOW_COUNT_NOT_COMPLETED | \n", "PARTFLOW_COUNT_COMPLETED | \n", "INTERVENTION_MODEL | \n", "OBSERVATION_MODEL | \n", "MASKING | \n", "TIME_PERSPECTIVE | \n", "PRIMARY_PURPOSE | \n", "ENDPOINT_CLASSIFICATION | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
NCT_ID | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
NCT00000378 | \n", "Completed | \n", "Actual | \n", "Phase 4 | \n", "Interventional | \n", "2.0 | \n", "0.0 | \n", "Actual | \n", "110 | \n", "Both | \n", "3120 | \n", "... | \n", "0 | \n", "1636 | \n", "477 | \n", "1159 | \n", "Parallel Assignment | \n", "NotDefined | \n", "Double Blind | \n", "NotDefined | \n", "Treatment | \n", "Efficacy Study | \n", "
NCT00001656 | \n", "Completed | \n", "Actual | \n", "Phase 4 | \n", "Interventional | \n", "2.0 | \n", "0.0 | \n", "Actual | \n", "25 | \n", "Both | \n", "364 | \n", "... | \n", "0 | \n", "1636 | \n", "477 | \n", "1159 | \n", "Parallel Assignment | \n", "NotDefined | \n", "Double Blind | \n", "NotDefined | \n", "Treatment | \n", "Efficacy Study | \n", "
NCT00004859 | \n", "Terminated | \n", "Actual | \n", "Phase 3 | \n", "Interventional | \n", "2.0 | \n", "0.0 | \n", "Actual | \n", "589 | \n", "Both | \n", "936 | \n", "... | \n", "0 | \n", "1636 | \n", "477 | \n", "1159 | \n", "Parallel Assignment | \n", "NotDefined | \n", "Open Label | \n", "NotDefined | \n", "Treatment | \n", "Efficacy Study | \n", "
NCT00001959 | \n", "Completed | \n", "Actual | \n", "Phase 2 | \n", "Interventional | \n", "1.0 | \n", "0.0 | \n", "Actual | \n", "21 | \n", "Both | \n", "936 | \n", "... | \n", "0 | \n", "1636 | \n", "477 | \n", "1159 | \n", "Single Group Assignment | \n", "NotDefined | \n", "Open Label | \n", "NotDefined | \n", "Treatment | \n", "Safety/Efficacy Study | \n", "
NCT00003222 | \n", "Completed | \n", "NaN | \n", "Phase 2 | \n", "Interventional | \n", "2.0 | \n", "0.0 | \n", "Actual | \n", "40 | \n", "Both | \n", "936 | \n", "... | \n", "0 | \n", "1636 | \n", "477 | \n", "1159 | \n", "Parallel Assignment | \n", "NotDefined | \n", "Open Label | \n", "NotDefined | \n", "Treatment | \n", "Efficacy Study | \n", "
5 rows × 33 columns
\n", "