본문 바로가기
카테고리 없음

[python]auto_ml titanic code

by  반  2020. 6. 18.

test.csv
0.03MB
train.csv
0.06MB

같은 폴더 내에 input 폴더 만들고 넣으면 됨

numpy, pandas, h2o 패키지 필요함

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("./input"))
# Any results you write to the current directory are saved as output.

#data cleaning and feature engineering 
def get_name_prefix(data):
    prefix = pd.Series(np.ones(data.shape[0]), index=data.index)
    data['Prefix'] = prefix
    data.loc[data.Name.str.contains('Miss.', regex=False), 'Prefix'] = 2
    data.loc[data.Name.str.contains('Mrs.', regex=False), 'Prefix'] = 3
    data.loc[data.Name.str.contains('Mr.', regex=False), 'Prefix'] = 4    

# https://stackoverflow.com/a/42523230
def one_hot(df, cols):
    """
    @param df pandas DataFrame
    @param cols a list of columns to encode 
    @return a DataFrame with one-hot encoding
    """
    for each in cols:
        dummies = pd.get_dummies(df[each], prefix=each, drop_first=False)
        del df[each]
        df = pd.concat([df, dummies], axis=1)
    return df

def normalize(df, mean, std):
    """
    @param df pandas DataFrame
    @param mean pandas Series of column values mean
    @param std pandas Series of column values standard deviation
    """
    for i in range(mean.size):
        df[mean.index[i]] = (df[mean.index[i]] - mean[0]) / std[0] 

def process_data(data):
    # get prefix data
    get_name_prefix(data)
    # remove name and ticket
    data.drop(['Ticket', 'Name'], inplace=True, axis=1)
    # sex
    data.loc[data.Sex != 'male', 'Sex'] = 0;
    data.loc[data.Sex == 'male', 'Sex'] = 1;
    # cabin
    data.Cabin.fillna('0', inplace=True)
    data.loc[data.Cabin.str[0] == 'A', 'Cabin'] = 1
    data.loc[data.Cabin.str[0] == 'B', 'Cabin'] = 2
    data.loc[data.Cabin.str[0] == 'C', 'Cabin'] = 3
    data.loc[data.Cabin.str[0] == 'D', 'Cabin'] = 4
    data.loc[data.Cabin.str[0] == 'E', 'Cabin'] = 5
    data.loc[data.Cabin.str[0] == 'F', 'Cabin'] = 6
    data.loc[data.Cabin.str[0] == 'G', 'Cabin'] = 7
    data.loc[data.Cabin.str[0] == 'T', 'Cabin'] = 8
    # embarked
    data.Embarked.fillna(0, inplace=True)
    data.loc[data.Embarked == 'C', 'Embarked'] = 1
    data.loc[data.Embarked == 'Q', 'Embarked'] = 2
    data.loc[data.Embarked == 'S', 'Embarked'] = 3
    data.fillna(-1, inplace=True)
    
    data = one_hot(data, ('Pclass', 'Sex', 'Cabin', 'Embarked', 'Prefix'))
    return data.astype(float)

#load data
train_raw = pd.read_csv('./input/train.csv')
test_raw = pd.read_csv('./input/test.csv')

train = process_data(train_raw)
test = process_data(test_raw)

data_mean = train[['Age','Fare','SibSp','Parch']].mean(axis=0)
data_std = train[['Age','Fare','SibSp','Parch']].std(axis=0)

normalize(train, data_mean, data_std)
normalize(test, data_mean, data_std)

test, train = test.align(train, axis=1, fill_value=0)

#start H2O 
import h2o
from h2o.automl import H2OAutoML

h2o.init()

#load data as h2o frames
train = h2o.H2OFrame(train)
test = h2o.H2OFrame(test)

#drop passengerId from data set
passId = test['PassengerId']
train = train.drop('PassengerId',axis =1)
test = test.drop('PassengerId',axis =1)

#identify predictors and labels
x = train.columns
y = 'Survived'
x.remove(y)

#for binary classification, lables should be a factor
train[y] = train[y].asfactor()

# Run AutoML
aml_ti = H2OAutoML(max_runtime_secs= 120,max_models= 10, seed= 7,nfolds= 10)
aml_ti.train(x = x, y = y,
          training_frame = train)

#check the leaderboard
lb_ti = aml_ti.leaderboard
lb_ti

#prediction
pred = aml_ti.leader.predict(test)

#save predict results to submission form
pred_df = pred.as_data_frame()
pred_res = pred_df.predict
passId_df = passId.as_data_frame()
res_ti = pd.concat([passId_df,pred_res],axis=1,ignore_index = True)
res_ti.columns = ['PassengerId','Survived']
res_ti.to_csv('mypred.csv',index=False)

#http://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html

댓글