NYC HS Graduation Rate Estimator




Pay Notebook Creator: Jendri Morocho0
Set Container: Numerical CPU with TINY Memory for 10 Minutes 0
Total0
In [1]:
#crosscompute
school_table_path = 'schools_test_data.csv'
target_folder = '/tmp'
In [2]:
import subprocess
subprocess.call('pip install -U folium'.split())
Out[2]:
0
In [3]:
#import geopandas
import pysal as ps
import folium
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/pysal/lib/weights/util.py:19: UserWarning: geopandas not available. Some functionality will be disabled.
  warn('geopandas not available. Some functionality will be disabled.')
/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/pysal/model/spvcm/abstracts.py:10: UserWarning: The `dill` module is required to use the sqlite backend fully.
  from .sqlite import head_to_sql, start_sql
In [4]:
import geotable
import numpy as np
import pandas as pd
In [5]:
import gzip
from invisibleroads_macros.disk import uncompress
from os.path import exists
from urllib.request import urlretrieve

def download(target_path, source_url):
    if not exists(target_path):
        urlretrieve(source_url, target_path)    
    return target_path

def download_zip(target_folder, source_url):
    archive_path = download(target_folder + '.zip', source_url)
    return uncompress(archive_path, target_folder)
            
def download_gz(target_path, source_url):
    archive_path = download(target_path + '.gz', source_url)
    with gzip.open(archive_path, 'rb') as f:
        open(target_path, 'wb').write(f.read())
In [6]:
cig_data = pd.read_excel('cig_data_trimmed_down.xlsx')
In [7]:
cig_data = cig_data[['Business_Name', 'Borough_Code', 'Longitude', 'Latitude']]
cig_data.head()
Out[7]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Business_Name Borough_Code Longitude Latitude
0 NEW STAR TOBACCO INC. 3 -74.007600 40.647730
1 BAJWA FOOD MARKET LLC 4 -73.859122 40.745863
2 SLOPE FOODS, INC. 3 -73.977380 40.680945
3 M & S DELI OF S.I. INC 5 -74.134515 40.625896
4 515 DELI CORP 1 -73.978312 40.741459
In [8]:
#schools = pd.read_csv('Modified_school_data.csv')
schools = pd.read_csv('Modified_school_data.csv', na_values= 's')
schools.head()
Out[8]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Demographic DBN School Name Cohort Total Cohort Total Grads - n Total Grads - % of cohort Total Regents - n Total Regents - % of cohort Total Regents - % of grads ... Local - n Local - % of cohort Local - % of grads Still Enrolled - n Still Enrolled - % of cohort Dropped Out - n Dropped Out - % of cohort Latitude Longitude boro_num
0 Total Cohort 01M292 HENRY STREET SCHOOL FOR INTERNATIONAL 2003 5 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN 40.713684 -73.986336 1
1 Total Cohort 01M292 HENRY STREET SCHOOL FOR INTERNATIONAL 2003 5 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN 40.713684 -73.986336 1
2 Total Cohort 01M292 HENRY STREET SCHOOL FOR INTERNATIONAL 2003 5 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN 40.713684 -73.986336 1
3 Total Cohort 01M292 HENRY STREET SCHOOL FOR INTERNATIONAL 2003 5 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN 40.713684 -73.986336 1
4 Total Cohort 01M292 HENRY STREET SCHOOL FOR INTERNATIONAL 2003 5 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN 40.713684 -73.986336 1
<p>5 rows × 26 columns</p>
In [9]:
schools_2006 = schools[schools['Cohort']== '2006']
schools_2006 = schools_2006[["Cohort","School Name","Total Cohort","Dropped Out - n","Dropped Out - % of cohort",
        "Total Grads - n","Total Grads - % of cohort",
         "Total Regents - % of cohort","Latitude","Longitude","boro_num"]]
#schools_2006.head()
schools_2006.index = range(len(schools_2006.index))
schools_2006
Out[9]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Cohort School Name Total Cohort Dropped Out - n Dropped Out - % of cohort Total Grads - n Total Grads - % of cohort Total Regents - % of cohort Latitude Longitude boro_num
0 2006 HENRY STREET SCHOOL FOR INTERNATIONAL 78 11.0 14.1 43.0 55.1 46.2 40.713684 -73.986336 1
1 2006 HENRY STREET SCHOOL FOR INTERNATIONAL 78 11.0 14.1 43.0 55.1 46.2 40.713684 -73.986336 1
2 2006 HENRY STREET SCHOOL FOR INTERNATIONAL 78 11.0 14.1 43.0 55.1 46.2 40.713684 -73.986336 1
3 2006 HENRY STREET SCHOOL FOR INTERNATIONAL 78 11.0 14.1 43.0 55.1 46.2 40.713684 -73.986336 1
4 2006 HENRY STREET SCHOOL FOR INTERNATIONAL 78 11.0 14.1 43.0 55.1 46.2 40.713684 -73.986336 1
5 2006 UNIVERSITY NEIGHBORHOOD HIGH SCHOOL 124 20.0 16.1 53.0 42.7 33.9 40.712399 -73.984497 1
6 2006 UNIVERSITY NEIGHBORHOOD HIGH SCHOOL 124 20.0 16.1 53.0 42.7 33.9 40.712399 -73.984497 1
7 2006 UNIVERSITY NEIGHBORHOOD HIGH SCHOOL 124 20.0 16.1 53.0 42.7 33.9 40.712399 -73.984497 1
8 2006 UNIVERSITY NEIGHBORHOOD HIGH SCHOOL 124 20.0 16.1 53.0 42.7 33.9 40.712399 -73.984497 1
9 2006 UNIVERSITY NEIGHBORHOOD HIGH SCHOOL 124 20.0 16.1 53.0 42.7 33.9 40.712399 -73.984497 1
10 2006 UNIVERSITY NEIGHBORHOOD HIGH SCHOOL 124 20.0 16.1 53.0 42.7 33.9 40.712399 -73.984497 1
11 2006 UNIVERSITY NEIGHBORHOOD HIGH SCHOOL 124 20.0 16.1 53.0 42.7 33.9 40.712399 -73.984497 1
12 2006 EAST SIDE COMMUNITY SCHOOL 90 5.0 5.6 70.0 77.8 74.4 40.729589 -73.982555 1
13 2006 EAST SIDE COMMUNITY SCHOOL 90 5.0 5.6 70.0 77.8 74.4 40.729589 -73.982555 1
14 2006 EAST SIDE COMMUNITY SCHOOL 90 5.0 5.6 70.0 77.8 74.4 40.729589 -73.982555 1
15 2006 EAST SIDE COMMUNITY SCHOOL 90 5.0 5.6 70.0 77.8 74.4 40.729589 -73.982555 1
16 2006 EAST SIDE COMMUNITY SCHOOL 90 5.0 5.6 70.0 77.8 74.4 40.729589 -73.982555 1
17 2006 EAST SIDE COMMUNITY SCHOOL 90 5.0 5.6 70.0 77.8 74.4 40.729589 -73.982555 1
18 2006 EAST SIDE COMMUNITY SCHOOL 90 5.0 5.6 70.0 77.8 74.4 40.729589 -73.982555 1
19 2006 MARTA VALLE HIGH SCHOOL 84 5.0 6.0 47.0 56.0 47.6 40.720581 -73.985645 1
20 2006 MARTA VALLE HIGH SCHOOL 84 5.0 6.0 47.0 56.0 47.6 40.720581 -73.985645 1
21 2006 MARTA VALLE HIGH SCHOOL 84 5.0 6.0 47.0 56.0 47.6 40.720581 -73.985645 1
22 2006 MARTA VALLE HIGH SCHOOL 84 5.0 6.0 47.0 56.0 47.6 40.720581 -73.985645 1
23 2006 MARTA VALLE HIGH SCHOOL 84 5.0 6.0 47.0 56.0 47.6 40.720581 -73.985645 1
24 2006 MARTA VALLE HIGH SCHOOL 84 5.0 6.0 47.0 56.0 47.6 40.720581 -73.985645 1
25 2006 MARTA VALLE HIGH SCHOOL 84 5.0 6.0 47.0 56.0 47.6 40.720581 -73.985645 1
26 2006 LOWER EAST SIDE PREPARATORY HIGH SCHO 193 35.0 18.1 105.0 54.4 47.2 40.720600 -73.985600 1
27 2006 LOWER EAST SIDE PREPARATORY HIGH SCHO 193 35.0 18.1 105.0 54.4 47.2 40.720600 -73.985600 1
28 2006 LOWER EAST SIDE PREPARATORY HIGH SCHO 193 35.0 18.1 105.0 54.4 47.2 40.720600 -73.985600 1
29 2006 LOWER EAST SIDE PREPARATORY HIGH SCHO 193 35.0 18.1 105.0 54.4 47.2 40.720600 -73.985600 1
... ... ... ... ... ... ... ... ... ... ... ...
2020 2006 BUSHWICK SCHOOL FOR SOCIAL JUSTICE 97 9.0 9.3 74.0 76.3 60.8 40.696970 -73.910791 3
2021 2006 BUSHWICK SCHOOL FOR SOCIAL JUSTICE 97 9.0 9.3 74.0 76.3 60.8 40.696970 -73.910791 3
2022 2006 BUSHWICK SCHOOL FOR SOCIAL JUSTICE 97 9.0 9.3 74.0 76.3 60.8 40.696970 -73.910791 3
2023 2006 BUSHWICK SCHOOL FOR SOCIAL JUSTICE 97 9.0 9.3 74.0 76.3 60.8 40.696970 -73.910791 3
2024 2006 ACADEMY OF URBAN PLANNING 115 10.0 8.7 55.0 47.8 35.7 40.696970 -73.910791 3
2025 2006 ACADEMY OF URBAN PLANNING 115 10.0 8.7 55.0 47.8 35.7 40.696970 -73.910791 3
2026 2006 ACADEMY OF URBAN PLANNING 115 10.0 8.7 55.0 47.8 35.7 40.696970 -73.910791 3
2027 2006 ACADEMY OF URBAN PLANNING 115 10.0 8.7 55.0 47.8 35.7 40.696970 -73.910791 3
2028 2006 ACADEMY OF URBAN PLANNING 115 10.0 8.7 55.0 47.8 35.7 40.696970 -73.910791 3
2029 2006 ACADEMY OF URBAN PLANNING 115 10.0 8.7 55.0 47.8 35.7 40.696970 -73.910791 3
2030 2006 ACADEMY OF URBAN PLANNING 115 10.0 8.7 55.0 47.8 35.7 40.696970 -73.910791 3
2031 2006 ALL CITY LEADERSHIP SECONDARY SCHOOL 43 2.0 4.7 36.0 83.7 53.5 40.697370 -73.913171 3
2032 2006 ALL CITY LEADERSHIP SECONDARY SCHOOL 43 2.0 4.7 36.0 83.7 53.5 40.697370 -73.913171 3
2033 2006 ALL CITY LEADERSHIP SECONDARY SCHOOL 43 2.0 4.7 36.0 83.7 53.5 40.697370 -73.913171 3
2034 2006 ALL CITY LEADERSHIP SECONDARY SCHOOL 43 2.0 4.7 36.0 83.7 53.5 40.697370 -73.913171 3
2035 2006 ALL CITY LEADERSHIP SECONDARY SCHOOL 43 2.0 4.7 36.0 83.7 53.5 40.697370 -73.913171 3
2036 2006 ALL CITY LEADERSHIP SECONDARY SCHOOL 43 2.0 4.7 36.0 83.7 53.5 40.697370 -73.913171 3
2037 2006 BUSHWICK LEADERS HIGH SCHOOL FOR ACAD 117 22.0 18.8 65.0 55.6 23.9 40.695056 -73.928140 3
2038 2006 BUSHWICK LEADERS HIGH SCHOOL FOR ACAD 117 22.0 18.8 65.0 55.6 23.9 40.695056 -73.928140 3
2039 2006 BUSHWICK LEADERS HIGH SCHOOL FOR ACAD 117 22.0 18.8 65.0 55.6 23.9 40.695056 -73.928140 3
2040 2006 BUSHWICK LEADERS HIGH SCHOOL FOR ACAD 117 22.0 18.8 65.0 55.6 23.9 40.695056 -73.928140 3
2041 2006 BUSHWICK LEADERS HIGH SCHOOL FOR ACAD 117 22.0 18.8 65.0 55.6 23.9 40.695056 -73.928140 3
2042 2006 BUSHWICK LEADERS HIGH SCHOOL FOR ACAD 117 22.0 18.8 65.0 55.6 23.9 40.695056 -73.928140 3
2043 2006 BUSHWICK COMMUNITY HIGH SCHOOL 128 52.0 40.6 10.0 7.8 2.3 40.695400 -73.915100 3
2044 2006 BUSHWICK COMMUNITY HIGH SCHOOL 128 52.0 40.6 10.0 7.8 2.3 40.695400 -73.915100 3
2045 2006 BUSHWICK COMMUNITY HIGH SCHOOL 128 52.0 40.6 10.0 7.8 2.3 40.695400 -73.915100 3
2046 2006 BUSHWICK COMMUNITY HIGH SCHOOL 128 52.0 40.6 10.0 7.8 2.3 40.695400 -73.915100 3
2047 2006 BUSHWICK COMMUNITY HIGH SCHOOL 128 52.0 40.6 10.0 7.8 2.3 40.695400 -73.915100 3
2048 2006 BUSHWICK COMMUNITY HIGH SCHOOL 128 52.0 40.6 10.0 7.8 2.3 40.695400 -73.915100 3
2049 2006 BUSHWICK COMMUNITY HIGH SCHOOL 128 52.0 40.6 10.0 7.8 2.3 40.695400 -73.915100 3
<p>2050 rows × 11 columns</p>
In [10]:
'''
dataf = pd.DataFrame(columns=['School', 'boro_num', 'Longitude', 'Latitude', 'Pct_cohort_regents'])
dataf.School =  schools_2006['School Name'].unique()
dataf.Pct_cohort_regents = schools_2006['Total Regents - % of cohort']
''';
In [11]:
schools_2006_v2 = schools_2006.drop_duplicates(subset = ['School Name'])
In [12]:
schools_2006_v2['Total Regents - % of cohort'].unique()
Out[12]:
array([ 46.2,  33.9,  74.4,  47.6,  47.2, 100. ,  40.4,  96.4,  32. ,
        75.5,  65.2,  66.7,  82.6,  56.3,  47.9,  65.6,  47.1,  38. ,
        61.1,  54.8,   nan,  62.2,   0. ,  76.3,  95.2,  85.5,  64.2,
        81.1,  99.2,  92.7,  61.7,  71. ,  40.3,  42.7,  55.1,  98.3,
        63.3,  58.5,  87.5,  42.1,  51.4,  59.2,  56.8,  42.6,   6.9,
        88.9,  30.9,  54.5,  41.7,  41.3,   7.9,  53. ,  23.3,  77.9,
        79.2,  31.2,  59.7,  60.2,  43. ,  20.5,  91.8,  96.1,  48.9,
        16. ,  93.3,  57.4,  39.5,  84. ,  31.5,  34.7,   6.2,  67.7,
        79.9,  58.9,  95.3,  54.6,  49.7,  56.7,  32.3,  64.8,  53.8,
        63.8,  51.9,  33.3,  80.3,  43.4,  54.9,  22.2,  61. ,  63.2,
        20.2,  65.3,  49.6,  32.7,  40.7,   1.3,  35.3,  31.3,  25. ,
        47. ,  55. ,  75.7,  77. ,  53.3,  49.3,  36.4,  31.8,  29.6,
        34.5,  70.9,  51.7,  52.8,  68.1,  60.3,  79. ,  53.7,  38.9,
        50.5,  18.6,  45.8,  32.2,  41.8,  61.5,  50. ,  48.6,  39.2,
        98.4,  86. ,  58.3,  97.4,  96.6,  33. ,  66.9,  53.5,  40.5,
        48.4,  58.1,  19.7,  76.6,  41.5,  58.6,  38.5,  14. ,  34.6,
        49.1,  38.4,  55.6,  52.1,  46.8,  91.4,  81.7,  71.6,   3.6,
        27.6,   2.3,  88. ,  27. ,  43.3,  92.2,  46.9,  47.8,  64. ,
        61.3,  72.2,  81.5,  23.6,  63.4,  81.6,  39.3,  35.6,  43.1,
         7.5,  57. ,   5.5,  31.9,  73.9,  45.2,  40. ,  42. ,  61.2,
        93.4,  45.5,   6.7,  91.7,  42.8,   8.9,  65.9,  29.9,  45.9,
        10.8,   6.3,  37.7,  22.1,  53.6,  72.4,  57.1,  15.5,  59.3,
        66.2,  70.7,  33.5,  56.4,  81.3,  59.4,  94.9,  90. ,  42.9,
        12.4,   6. ,  11.3,  60.6,  87.8,  67.3,  96. ,  36.1,  43.2,
        57.9,  71.2,   7.1,  44.2,  77.3,  80.5,  13.5,  52.2,  70.5,
        65.4,  50.1,  77.1,  65.1,  55.8,  83.1,  26.4,  82.5,  40.1,
        78. ,  92.3,  54.7,  67.2,  73.8,  64.9,  53.9,  57.5,  95.8,
        82.3,  59.1,  68.5,  63.6,  15.4,  99.7,  29.8,  60.8,  35.7,
        23.9])
In [13]:
schools_2006_v3 = schools_2006_v2.dropna().copy()
In [14]:
#schools_2006_v3.dtypes
In [15]:
#from geopy.distance import geodesic as get_distance
In [16]:
'''
get_distance
newport_ri = (41.49008, -71.312796)
cleveland_oh = (41.499498, -81.695391)
x = get_distance(newport_ri, cleveland_oh)
''';
In [17]:
'''
for index,row in schools_2006_v3.iterrows():
    rad = .3
    school_loc = row['Latitude'], row['Longitude']
    cig_count = 0
    
    for cig_index,cig_row in cig_data[cig_data['Borough_Code']==row['boro_num']].iterrows():
        cig_loc = cig_row['Latitude'], cig_row['Longitude']
        dist = get_distance(school_loc,cig_loc).miles
        if(dist <= rad ):
            cig_count += 1
        
    schools_2006_v3['num_stores'] = cig_count
''';
In [18]:
'''
stores_xys = []
for index, row in cig_data.iterrows():
    stores_xys.append((row['Longitude'], row['Latitude']))
stores_xys[:5]''';
In [19]:
store_xys = cig_data[['Longitude', 'Latitude']].values
In [20]:
test_data = pd.read_csv(school_table_path) # Use a given file of number of stores near a school
test_data.head()
Out[20]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
number_of_stores
0 12
1 14
2 20
3 70
4 89
In [21]:
# Make school kdtree
# from pysal.cg.kdtree import KDTree
# from pysal.cg import RADIUS_EARTH_MILES

# store_tree = KDTree(store_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)
In [22]:
# NEW
from pysal.lib.cg import KDTree, RADIUS_EARTH_KM
from pysal.lib.weights import KNN
store_tree = KDTree(np.array(store_xys), distance_metric='Arc', radius=RADIUS_EARTH_KM)

#w = KNN(store_tree, k=2)
#w.set_transform('R')
In [25]:
radius_in_miles = 0.4
store_count = len(cig_data)
def get_store_count(r):
    xy = r['Longitude'], r['Latitude']
    distances, indices = store_tree.query(
        xy, k=store_count, distance_upper_bound=radius_in_miles)
    return sum(indices < store_count)


schools_2006_v3['nearby_store_count'] = schools_2006_v3.apply(
    get_store_count, axis=1)
In [26]:
schools_2006_v3.head() #nearby store count needs to be changed
Out[26]:
<style scoped=""> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
Cohort School Name Total Cohort Dropped Out - n Dropped Out - % of cohort Total Grads - n Total Grads - % of cohort Total Regents - % of cohort Latitude Longitude boro_num nearby_store_count
0 2006 HENRY STREET SCHOOL FOR INTERNATIONAL 78 11.0 14.1 43.0 55.1 46.2 40.713684 -73.986336 1 15
5 2006 UNIVERSITY NEIGHBORHOOD HIGH SCHOOL 124 20.0 16.1 53.0 42.7 33.9 40.712399 -73.984497 1 8
12 2006 EAST SIDE COMMUNITY SCHOOL 90 5.0 5.6 70.0 77.8 74.4 40.729589 -73.982555 1 56
19 2006 MARTA VALLE HIGH SCHOOL 84 5.0 6.0 47.0 56.0 47.6 40.720581 -73.985645 1 68
26 2006 LOWER EAST SIDE PREPARATORY HIGH SCHO 193 35.0 18.1 105.0 54.4 47.2 40.720600 -73.985600 1 68
In [27]:
X = schools_2006_v3[['nearby_store_count']].values
In [28]:
y = schools_2006_v3['Total Grads - % of cohort'].values
In [29]:
from sklearn.svm import SVR
model = SVR()
model.fit(X, y)
/home/user/.virtualenvs/crosscompute/lib/python3.6/site-packages/sklearn/svm/base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
Out[29]:
SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)
In [30]:
test_data.columns
X_test = test_data[['number_of_stores']].values
In [31]:
#X_test['graduation_rate']=model.predict(X_test)
test_data['graduation_rate'] = model.predict(X_test)
In [32]:
test_data['graduation_rate']
Out[32]:
0    64.617948
1    66.800153
2    65.799808
3    68.433029
4    66.837539
5    66.837539
6    67.469660
7    67.469660
8    67.469660
Name: graduation_rate, dtype: float64
In [33]:
from os.path import join
target_path = join(target_folder,'updated_table.csv')
test_data.to_csv(target_path,index = False)
print('graduation_table_path = %s' % target_path)
graduation_table_path = /tmp/updated_table.csv