NYC HS Graduation Rate Estimator




Pay Notebook Creator: Jendri Morocho — 0
Set Container: Numerical CPU with TINY Memory for 10 Minutes — 0
Total — 0
In [1]:
# crosscompute
# NOTE(review): the marker above presumably tells CrossCompute to treat the
# assignments in this cell as tool arguments — confirm against the platform docs.
# CSV that is read further down and scored with the fitted model.
Test_Data_Path = 'Testing_Data.csv'
# Folder that receives the generated 'updated_table.csv'.
target_folder = '/tmp'
In [2]:
import gzip
from invisibleroads_macros.disk import uncompress
from os.path import exists
from urllib.request import urlretrieve

def download(target_path, source_url):
    """Fetch ``source_url`` into ``target_path`` unless that file already exists.

    The payload is written to a sibling ``<target_path>.part`` file first and
    is only renamed onto ``target_path`` once ``urlretrieve`` has finished.
    An interrupted transfer therefore never leaves a truncated file that the
    ``exists`` check would treat as a finished download on the next run.

    Parameters
    ----------
    target_path : str or path-like
        Where the downloaded file should end up.
    source_url : str
        URL handed to ``urllib.request.urlretrieve``.

    Returns
    -------
    The ``target_path`` argument, unchanged.

    Raises
    ------
    Any exception raised by ``urlretrieve`` or the rename is propagated;
    ``target_path`` is not created in that case.
    """
    import os  # local import: the notebook's import cell only brings in os.path.exists

    if exists(target_path):
        return target_path

    partial_path = str(target_path) + '.part'
    try:
        urlretrieve(source_url, partial_path)
        os.replace(partial_path, target_path)
    finally:
        # After a successful rename there is nothing left to remove; after a
        # failure this discards the incomplete payload.
        if exists(partial_path):
            os.remove(partial_path)
    return target_path

def download_zip(target_folder, source_url):
    """Download a zip archive next to ``target_folder`` and unpack it there.

    The archive is stored at ``target_folder + '.zip'`` via ``download`` and
    then handed to ``uncompress`` together with ``target_folder``.

    Returns whatever ``uncompress`` returns (presumably the extraction
    folder — confirm against invisibleroads_macros).
    """
    zip_path = target_folder + '.zip'
    local_archive = download(zip_path, source_url)
    extracted = uncompress(local_archive, target_folder)
    return extracted
            
def download_gz(target_path, source_url):
    """Download a gzip archive and decompress it to ``target_path``.

    The archive is stored at ``target_path + '.gz'`` via ``download`` and its
    decompressed bytes are streamed into ``target_path``.

    Both file handles are managed by ``with`` so the output is flushed and
    closed deterministically, and the data is copied in chunks rather than
    being read into memory in one piece.

    Parameters
    ----------
    target_path : str
        Destination of the decompressed file.
    source_url : str
        URL of the ``.gz`` archive.

    Returns
    -------
    ``target_path``, mirroring ``download``.  Callers that ignored the
    previous ``None`` return value are unaffected.
    """
    import shutil  # local import: not part of the notebook's import cell

    archive_path = download(target_path + '.gz', source_url)
    with gzip.open(archive_path, 'rb') as source_file, \
            open(target_path, 'wb') as target_file:
        shutil.copyfileobj(source_file, target_file)
    return target_path
In [3]:
import subprocess
import sys

# Upgrade seaborn for the interpreter that runs this kernel. Calling a bare
# `pip` picks up whichever executable is first on PATH, which may belong to
# a different Python environment than the one importing seaborn below.
# `subprocess.call` is kept on purpose: it returns the exit status (0 on
# success) and does not abort the notebook if the upgrade fails.
subprocess.call([sys.executable, '-m', 'pip', 'install', '-U', 'seaborn'])
Out[3]:
0
In [4]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [5]:
# Load the cleaned 2006 school table that the model is trained on.
schools_2006_csv = "Clean School Data 2006.csv"
schools_2006 = pd.read_csv(schools_2006_csv)
In [6]:
# Feature matrix: cohort size, nearby store count, advanced-regents count
# and the five borough indicator columns, in this exact order.
feature_columns = [
    'Total Cohort',
    'nearby_store_count',
    'Advanced Regents - n',
    'boro_num1',
    'boro_num2',
    'boro_num3',
    'boro_num4',
    'boro_num5',
]
X = schools_2006[feature_columns].values
In [7]:
# Regression target: number of graduates per school.
target_column = 'Total Grads - n'
y = schools_2006[target_column].values

Train/Test Split

In [8]:
from sklearn.model_selection import train_test_split
In [9]:
# Hold out 40% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=101,
)
In [10]:
# Row counts of the training and held-out partitions.
n_train, n_test = len(X_train), len(X_test)
print(n_train, n_test)
179 120

Creating and Testing the Model (OLS-Linear Model)

In [11]:
from sklearn.linear_model import LinearRegression
In [12]:
# Ordinary least squares on the training partition. `fit` returns the
# estimator itself, so the trailing `lm` displays the same repr as before.
lm = LinearRegression().fit(X_train, y_train)
lm
Out[12]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
In [13]:
# Predicted graduate counts for the held-out schools.
predictions = lm.predict(X=X_test)
In [14]:
# Predicted vs. actual graduates on the held-out schools. The explicit
# Axes gets labels and a title so the figure is readable on its own, and
# the trailing semicolon keeps the artist repr out of the cell output.
fig, ax = plt.subplots()
ax.scatter(y_test, predictions)
ax.set_xlabel('Actual Total Grads - n')
ax.set_ylabel('Predicted Total Grads - n')
ax.set_title('Held-out schools: predicted vs. actual graduates');
Out[14]:
<matplotlib.collections.PathCollection at 0x7f61cc9aada0>
In [15]:
# Distribution of residuals (actual minus predicted) on the held-out schools.
residuals = y_test - predictions
if hasattr(sns, 'histplot'):
    # `distplot` is deprecated in current seaborn releases; a density
    # histogram with a KDE overlay is its documented replacement.
    ax = sns.histplot(residuals, bins=50, kde=True, stat='density')
else:
    # Older seaborn without `histplot`: keep the original call.
    ax = sns.distplot(residuals, bins=50)
ax.set_xlabel('Residual (actual - predicted graduates)')
ax.set_title('Residual distribution on held-out schools');
In [16]:
from sklearn import metrics
from sklearn.metrics import r2_score
In [17]:
# Error metrics on the held-out schools. The MSE is computed once and
# reused for the RMSE line.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r_squared = r2_score(y_test, predictions)

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)
print("R^2:", r_squared)
MAE: 19.69785437491135
MSE: 861.6196757743496
RMSE: 29.35335884995701
R^2: 0.9659457703829093

Testing on the User-Supplied CSV

In [18]:
# Read the user-supplied table (one row per school, including the number of
# stores near each school).
test_data = pd.read_csv(Test_Data_Path)
# Keep only the model's feature columns, in training order. `.copy()` makes
# this an independent frame: new columns are added to it later, and writing
# into a slice of `test_data` would raise SettingWithCopyWarning.
test_data_v2 = test_data[[
    'Total Cohort',
    'nearby_store_count',
    'Advanced Regents - n',
    "boro_num1",
    "boro_num2",
    "boro_num3",
    "boro_num4",
    "boro_num5",
]].copy()
# Actual graduate counts, kept as a one-column frame for later comparison.
y_test_data = test_data[['Total Grads - n']]
In [19]:
# Build the model input from the eight feature columns by name. Selecting
# "every column currently in test_data_v2" only works before the prediction
# columns are added; re-running this cell afterwards would pass extra
# columns to the model.
feature_columns = [
    'Total Cohort',
    'nearby_store_count',
    'Advanced Regents - n',
    'boro_num1',
    'boro_num2',
    'boro_num3',
    'boro_num4',
    'boro_num5',
]
User_X_test = test_data_v2[feature_columns].values
In [20]:
# Work on an independent frame so the column assignments below never write
# into a slice of the loaded table (avoids SettingWithCopyWarning).
test_data_v2 = test_data_v2.copy()

# Round to the nearest whole graduate before casting. A bare astype(int)
# truncates toward zero, so 99.9 would be reported as 99.
predicted_grads = np.rint(lm.predict(User_X_test)).astype(int)
test_data_v2["(Predicted)Total Grads - n"] = predicted_grads

# y_test_data is a one-column frame; flatten its (n, 1) values to 1-D
# before storing them as a single column.
test_data_v2["Total Grads - n"] = y_test_data.values.ravel()

# Absolute gap between the estimate and the reported count.
test_data_v2['Difference in results'] = (
    test_data_v2["(Predicted)Total Grads - n"] - test_data_v2["Total Grads - n"]
).abs()
In [21]:
from os.path import join
# Write the scored table into the output folder without the index column,
# then print its location as a `name = path` line.
target_path = join(target_folder, 'updated_table.csv')
test_data_v2.to_csv(path_or_buf=target_path, index=False)
print('graduation_table_path = %s' % (target_path,))
graduation_table_path = /tmp/updated_table.csv