# Introduction to Computational Analysis

 Pay Notebook Creator: Roy Hyunjin Han 0 Set Container: Numerical CPU with TINY Memory for 10 Minutes 0 Total 0

The goal is to build a machine that makes decisions automatically using information it has not seen before, and whose performance improves with experience. The approach in machine learning is to develop algorithms that make decisions using a model fitted on data.

# Machine learning is easy with Scikit-Learn

The scikit-learn package is a collection of machine learning algorithms that share a common usage pattern:

• Pick model.
• Fit model parameters to data.
• Predict using fitted model.
In [2]:
# Fit a k-nearest-neighbors classifier on the iris dataset and classify a
# previously unseen flower measurement.
from sklearn import datasets, neighbors

# BUG FIX: `iris` was referenced but never defined; load it explicitly.
iris = datasets.load_iris()
model = neighbors.KNeighborsClassifier()
model.fit(iris.data, iris.target)
# BUG FIX: predict() expects a 2D array-like (one row per sample).
model.predict([[7.5, 3, 6.5, 2.1]])

In [ ]:
# Take a moment to browse the official tutorials and examples


## Which model do we use?

In [10]:
from sklearn.datasets import load_digits

# BUG FIX: the original referenced an undefined name `digits`;
# load_digits was imported but never called.
digits = load_digits()
X, y = digits.data, digits.target
# Hold out the last 100 samples for testing; train on the rest.
trainingSet = X[:-100], y[:-100]
testSet = X[-100:], y[-100:]


def evaluate_model(model):
    """Fit *model* on the training set and return its test-set accuracy."""
    # (Indentation of this body was missing in the notebook export.)
    return model.fit(*trainingSet).score(*testSet)

In [14]:
# BUG FIX: sklearn.gaussian_process.GaussianProcess (a regressor) was
# deprecated in scikit-learn 0.18 and removed in 0.20; for this
# classification task the counterpart is GaussianProcessClassifier.
from sklearn.gaussian_process import GaussianProcessClassifier

evaluate_model(GaussianProcessClassifier())

In [17]:
# Score a decision-tree classifier with default hyperparameters.
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()
evaluate_model(tree_model)

In [22]:
# Score a linear-kernel support vector classifier with weak
# regularization (small C).
from sklearn.svm import SVC

svc_model = SVC(kernel='linear', C=0.001)
evaluate_model(svc_model)


## Evaluate model performance with cross-validation

In [26]:
# BUG FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# cross_val_score now lives in sklearn.model_selection.  Also load the
# iris dataset, which the original referenced without defining.
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

iris = load_iris()
model = LogisticRegression()
# One accuracy score per cross-validation fold.
cross_val_score(model, iris.data, iris.target)

In [29]:
cross_val_score(model, iris.data, iris.target, cv=4)

In [31]:
# BUG FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# LeaveOneOut now lives in sklearn.model_selection and no longer takes
# the sample count as a constructor argument.
from sklearn.model_selection import LeaveOneOut

# One fold per sample: expensive but nearly unbiased.
cross_val_score(model, iris.data, iris.target, cv=LeaveOneOut())


## Evaluate stack performance with pipelining

In [23]:
from IPython.lib.display import YouTubeVideo

In [42]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
# BUG FIX: cross_val_score moved to sklearn.model_selection in 0.20.
from sklearn.model_selection import cross_val_score
# BUG FIX: `digits` was referenced but never defined; load it explicitly.
from sklearn.datasets import load_digits

digits = load_digits()
# Chain dimensionality reduction and the classifier as one estimator.
model = Pipeline([
    ('pca', PCA()),
    ('logistic', LogisticRegression()),
])
# Mean accuracy across the default cross-validation folds.
np.mean(cross_val_score(model, digits.data, digits.target))


## Transform data

In [44]:
from sklearn.preprocessing import StandardScaler
# BUG FIX: `digits` was never defined anywhere in the notebook; load it
# here so the cell runs (Pipeline, PCA, LogisticRegression, np and
# cross_val_score come from the previous cell's imports).
from sklearn.datasets import load_digits

digits = load_digits()
# Standardize features before PCA so each pixel contributes equally.
model = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('logistic', LogisticRegression()),
])
np.mean(cross_val_score(model, digits.data, digits.target))


Let's vectorize a stanza from Zbigniew Herbert's A Knocker.

In [51]:
from sklearn.feature_extraction.text import CountVectorizer

# Each line of the stanza becomes one document.
documents = [
    'my imagination',
    'is a piece of board',
    'my sole instrument',
    'is a wooden stick',
]
# Count word occurrences; keep every term that appears at least once.
vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(documents)
# Densify the sparse count matrix for display.
documentVectors = X.toarray()
documentVectors

In [52]:
# BUG FIX: Python 2 print statement -> print() function, and
# get_feature_names() was removed in scikit-learn 1.2 in favor of
# get_feature_names_out().
featureNames = vectorizer.get_feature_names_out()
for bagOfWords in documentVectors:
    # Pair each vocabulary word with its count in this document.
    print(list(zip(featureNames, bagOfWords)))

In [70]:
# Adapted from
# http://scikit-learn.org/dev/tutorial/statistical_inference/putting_together.html
from sklearn import linear_model, decomposition, datasets
# BUG FIX: GridSearchCV moved from sklearn.grid_search to
# sklearn.model_selection in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# BUG FIX: `digits` was referenced but never defined; load it explicitly.
digits = datasets.load_digits()

pipeline = Pipeline([
    ('pca', decomposition.PCA()),
    ('logistic', linear_model.LogisticRegression()),
])
# Search over PCA dimensionality and regularization strength.
gridSearch = GridSearchCV(pipeline, dict(
    pca__n_components=[20, 40],
    logistic__C=[1, 1000]))

gridSearch.fit(digits.data, digits.target)
valueByParameter = gridSearch.best_estimator_.get_params()
for parameter in gridSearch.param_grid:
    # BUG FIX: Python 2 print statement -> print() function.
    print('%s: %r' % (parameter, valueByParameter[parameter]))


## Identify a translator of Zbigniew Herbert

In [93]:
from archiveIO import Archive, TemporaryFolder

# NOTE(review): this cell appears incomplete as exported — nothing binds
# `text`, and the two append lines are not indented under the `with`
# block, so as written it fails before running.  Presumably a loop such
# as `for text in <texts extracted into temporaryFolder>:` was lost in
# the export — confirm against the original notebook.
archive = Archive('datasets/ZbigniewHerbert.tar.gz')
documents = []
categories = []  # True when the translation contains the word 'Carpenter'
with TemporaryFolder() as temporaryFolder:
documents.append(text)
categories.append('Carpenter' in text)

In [94]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier

# Text-classification stack: raw term counts -> tf-idf weighting ->
# linear model trained with stochastic gradient descent.
steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
]
pipeline = Pipeline(steps)

In [95]:
# Hyperparameter grid for the text pipeline.
# BUG FIX: CountVectorizer's `max_n` option was removed from
# scikit-learn; n-gram size is configured with `ngram_range` instead
# ((1, 1) = unigrams only, (1, 2) = unigrams and bigrams).
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
}

In [101]:
# BUG FIX: GridSearchCV moved from sklearn.grid_search to
# sklearn.model_selection in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV

# Evaluate the parameter grid using every available core.
gridSearch = GridSearchCV(pipeline, parameters, n_jobs=-1)
gridSearch.fit(documents, categories)

valueByParameter = gridSearch.best_estimator_.get_params()
for parameter in gridSearch.param_grid:
    # BUG FIX: Python 2 print statements -> print() function.
    print('%s: %r' % (parameter, valueByParameter[parameter]))
print("Best score: %0.3f" % gridSearch.best_score_)

In [92]:
print documents[27]