# Introduction to Computational Analysis

Notebook Creator: Roy Hyunjin Han. Container: Numerical CPU with TINY Memory for 10 Minutes.
In [1]:
In [2]:
# Give the three source tables the same short column headers used by the
# cells that follow.
shortColumnNames = {
    'Country or Area': 'Country',
    'Comm. Code': 'CommodityCode',
}
for table in (alcohol, chocolate, marriage):
    # inplace=True changes the existing frames, so the names alcohol,
    # chocolate and marriage keep referring to the renamed tables.
    table.rename(columns=shortColumnNames, inplace=True)
In [3]:
# Trim each table to the rows and columns used by the rest of the notebook.
tradeColumns = ['Country', 'CommodityCode', 'Commodity', 'Flow', 'Quantity']
# NOTE(review): the CommodityCode thresholds are kept exactly as written;
# presumably they select the commodities of interest -- confirm against the
# Commodity listings shown in the cells below.
alcohol = alcohol[alcohol['CommodityCode'] >= 220300][tradeColumns]
chocolate = chocolate[chocolate['CommodityCode'] >= 180620][tradeColumns]
marriage = marriage[['Country', 'Subgroup', 'Value']]
In [4]:
# Show the record stored under the first index label of the alcohol table.
# .loc performs the same label-based lookup as the .ix indexer, which was
# removed in pandas 1.0.
alcohol.loc[alcohol.index[0]]
In [5]:
# List every distinct CommodityCode once (keeping the first Commodity value
# seen for it), ordered by code. sort_values replaces DataFrame.sort, which
# was removed in pandas 0.20.
(
    alcohol[['CommodityCode', 'Commodity']]
    .drop_duplicates(subset='CommodityCode')
    .sort_values(by='CommodityCode')
)
In [6]:
# Show the record stored under the first index label of the chocolate table.
# .loc performs the same label-based lookup as the .ix indexer, which was
# removed in pandas 1.0.
chocolate.loc[chocolate.index[0]]
In [7]:
# List every distinct CommodityCode once (keeping the first Commodity value
# seen for it), ordered by code. sort_values replaces DataFrame.sort, which
# was removed in pandas 0.20.
(
    chocolate[['CommodityCode', 'Commodity']]
    .drop_duplicates(subset='CommodityCode')
    .sort_values(by='CommodityCode')
)
In [8]:
# Show the record stored under the first index label of the marriage table.
# .loc performs the same label-based lookup as the .ix indexer, which was
# removed in pandas 1.0.
marriage.loc[marriage.index[0]]
In [9]:
# Average the Value column within each Subgroup. Selecting [['Value']] before
# aggregating keeps non-numeric columns (presumably Country) out of mean(),
# which pandas 2.x refuses to average, and still yields a one-column DataFrame.
marriage.groupby('Subgroup')[['Value']].mean()
In [10]:
# Reshape the marriage table to one row per Country and one column per
# Subgroup, each cell holding the matching Value. The arguments are passed by
# keyword because pandas 2.0 no longer accepts them positionally.
marriagePivot = marriage.pivot(index='Country', columns='Subgroup', values='Value')
marriagePivot
In [11]:
from pandas import Series

def compute_datasetRow(row):
    """Build the feature row for one country.

    Intended to be applied row-wise to marriagePivot: ``row.name`` is the
    row's Country index label and the row supplies the 'Female' and 'Male'
    values.

    The alcohol and chocolate tables are restricted to the rows of that
    country before being summed, so each country receives its own import and
    export quantities instead of the totals over every country.

    Returns a Series named after the country holding the four trade
    quantities (0 when sum_by_flow reports no 'Import' or 'Export' entry) and
    the two marriage values.
    """
    country = row.name
    # NOTE(review): sum_by_flow is defined outside this cell; it is assumed to
    # accept a table and return a mapping-like object keyed by Flow -- confirm.
    alcoholFlow = sum_by_flow(alcohol[alcohol['Country'] == country])
    chocolateFlow = sum_by_flow(chocolate[chocolate['Country'] == country])
    return Series(dict(
        AlcoholImported=alcoholFlow.get('Import', 0),
        AlcoholExported=alcoholFlow.get('Export', 0),
        ChocolateImported=chocolateFlow.get('Import', 0),
        ChocolateExported=chocolateFlow.get('Export', 0),
        MarriageAgeFemale=row['Female'],
        MarriageAgeMale=row['Male']), name=country)

# Assemble the modelling table: axis=1 passes each row of marriagePivot to
# compute_datasetRow, and the returned Series become the rows of dataset.
dataset = marriagePivot.apply(func=compute_datasetRow, axis=1)
In [12]:
# Index label of the dataset row with the largest AlcoholImported value.
dataset['AlcoholImported'].idxmax()
In [13]:
# The five largest AlcoholImported values, highest first. sort_values replaces
# Series.order, which was removed in pandas 0.20; head(5) takes the first five
# entries by position.
dataset['AlcoholImported'].sort_values(ascending=False).head(5)
In [14]:
import numpy as np
from pandas import DataFrame
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
try:
    # scikit-learn 0.18 and later provide cross_val_score in model_selection.
    from sklearn.model_selection import cross_val_score
except ImportError:
    # Older releases only ship the original module, which 0.20 removed.
    from sklearn.cross_validation import cross_val_score

# Every dataset column whose name does not start with 'Marriage' is used as a
# model input; the Marriage* columns are the prediction targets.
featureColumns = [
    column for column in dataset.columns
    if not column.startswith('Marriage')]

def score_model(model, targetColumn):
    """Return the mean cross-validation score of model for one target.

    Rows of dataset with a missing value in any feature column or in
    targetColumn are dropped first. cross_val_score is called with its default
    fold and scoring settings, and the per-fold scores are averaged.
    """
    neededColumns = featureColumns + [targetColumn]
    complete = dataset[neededColumns].dropna()
    foldScores = cross_val_score(
        model, complete[featureColumns], complete[targetColumn])
    return np.mean(foldScores)

def score_models(models):
    """Score each model on both marriage targets.

    Returns a DataFrame with one row per model, labelled with the model's
    class name, and the columns 'Female' and 'Male' holding the result of
    score_model for 'MarriageAgeFemale' and 'MarriageAgeMale' respectively.
    """
    rowLabels = [model.__class__.__name__ for model in models]
    targetColumns = ('MarriageAgeFemale', 'MarriageAgeMale')
    scoreRows = [
        [score_model(model, targetColumn) for targetColumn in targetColumns]
        for model in models]
    return DataFrame(scoreRows, index=rowLabels, columns=['Female', 'Male'])
In [15]:
# Compare a linear regression, an SVR on the raw features, and an SVR that
# receives the features after a StandardScaler step.
scaledSVR = Pipeline([
    ('StandardScaler', StandardScaler()),
    ('Model', SVR()),
])
score_models([LinearRegression(), SVR(), scaledSVR])
In [16]:
from sklearn.feature_selection import RFECV

def rank_features(model, targetColumn):
    """Rank the feature columns for predicting targetColumn.

    Fits an RFECV selector wrapped around model on the rows of dataset that
    have no missing value in the feature columns or in targetColumn, then
    returns (ranking, featureColumn) pairs sorted in ascending order.
    """
    complete = dataset[featureColumns + [targetColumn]].dropna()
    selector = RFECV(model)
    selector.fit(complete[featureColumns], complete[targetColumn])
    rankedFeatures = list(zip(selector.ranking_, featureColumns))
    rankedFeatures.sort()
    return rankedFeatures
In [17]:
# Feature ranking for the MarriageAgeFemale target using a linear regression.
rank_features(model=LinearRegression(), targetColumn='MarriageAgeFemale')
In [18]:
# Feature ranking for the MarriageAgeMale target using a linear regression.
rank_features(model=LinearRegression(), targetColumn='MarriageAgeMale')