Sklearn Pipelines and XGBoost

Data from Kaggle's Mobile Price Classification dataset

Pipelines

import pandas as pd

from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
DATA_FILE = 'sample-data/mobile-price-classification/train.csv'

Import Data

df = pd.read_csv(DATA_FILE)
df.head()
battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt n_cores pc px_height px_width ram sc_h sc_w talk_time three_g touch_screen wifi price_range
0 842 0 2.2 0 1 0 7 0.6 188 2 2 20 756 2549 9 7 19 0 0 1 1
1 1021 1 0.5 1 0 1 53 0.7 136 3 6 905 1988 2631 17 3 7 1 1 0 2
2 563 1 0.5 1 2 1 41 0.9 145 5 6 1263 1716 2603 11 2 9 1 1 0 2
3 615 1 2.5 0 0 0 10 0.8 131 6 9 1216 1786 2769 16 8 11 1 0 0 2
4 1821 1 1.2 0 13 1 44 0.6 141 2 14 1208 1212 1411 8 2 15 1 1 0 1

Separate Variables

X = df.drop('price_range', axis=1)
y = df.price_range
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

Split Data Types

# Categorical columns: low-cardinality columns of object dtype
categorical_cols = [cname for cname in X_train_full.columns
                    if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"]

# Numerical columns (every column in this dataset is numeric, so categorical_cols
# ends up empty; the categorical branch is kept to show the general pattern)
numerical_cols = [cname for cname in X_train_full.columns
                  if X_train_full[cname].dtype in ['int64', 'float64']]
my_cols = categorical_cols + numerical_cols

X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_train.head()
battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt n_cores pc px_height px_width ram sc_h sc_w talk_time three_g touch_screen wifi
582 1232 0 2.9 1 1 1 24 0.3 169 5 17 361 809 1257 16 10 16 1 0 0
159 1840 0 0.5 1 12 0 34 0.7 142 1 16 311 1545 1078 8 0 10 0 0 0
1827 1692 0 2.1 0 4 1 2 0.9 106 1 17 1899 1904 3779 9 3 7 1 1 1
318 508 0 0.8 0 7 1 42 0.3 94 1 8 39 557 663 13 12 7 1 0 0
708 977 1 2.8 1 2 0 35 0.6 165 2 15 1502 1862 3714 19 3 10 0 1 1

Create Transformers for the different types of data

numerical_transformer = SimpleImputer(strategy='constant')


categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

Add these transformers to a Preprocessor

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])
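As a quick sketch, the preprocessor can also be fitted on its own to inspect what it produces before wiring it into a pipeline:

# Sketch: fit and apply the preprocessor by itself.
# With no categorical columns in this dataset, the output width matches
# the 20 numeric feature columns.
transformed = preprocessor.fit_transform(X_train)
print(transformed.shape)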

Define Model

model = RandomForestRegressor(n_estimators=100, random_state=0)

Create Pipeline

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)])

Run the Pipeline

pipeline.fit(X_train, y_train)
Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['battery_power', 'blue',
                                                   'clock_speed', 'dual_sim',
                                                   'fc', 'four_g', 'int_memory',
                                                   'm_dep...
                 RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                       criterion='mse', max_depth=None,
                                       max_features='auto', max_leaf_nodes=None,
                                       max_samples=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=100, n_jobs=None,
                                       oob_score=False, random_state=0,
                                       verbose=0, warm_start=False))],
         verbose=False)
predictions = pipeline.predict(X_valid)
results = X_valid.copy()  # copy so we don't mutate the validation features
results['predicted'] = predictions
results['actual'] = y_valid
results['diff'] = abs(results['predicted'] - results['actual'])
results.head(10)
battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt n_cores pc px_height px_width ram sc_h sc_w talk_time three_g touch_screen wifi predicted actual diff
405 1454 1 0.5 1 1 0 34 0.7 83 4 3 250 1033 3419 7 5 5 1 1 0 2.98 3 0.02
1190 1092 1 0.5 1 10 0 11 0.5 167 3 14 468 571 737 14 4 11 0 1 0 0.00 0 0.00
1132 1524 1 1.8 1 0 0 10 0.6 174 4 1 154 550 2678 16 5 13 1 0 1 1.97 2 0.03
731 1807 1 2.1 0 2 0 49 0.8 125 1 10 337 1384 1906 17 13 13 0 1 1 1.69 2 0.31
1754 1086 1 1.7 1 0 1 43 0.2 111 6 1 56 1150 3285 11 5 17 1 1 0 2.70 2 0.70
1178 909 1 0.5 1 9 0 30 0.4 97 3 10 290 773 594 12 0 4 1 1 1 0.00 0 0.00
1533 642 1 0.5 0 0 1 38 0.8 86 5 10 887 1775 435 9 2 2 1 1 0 0.08 0 0.08
1303 888 0 2.6 1 2 1 33 0.4 198 2 17 327 1683 3407 12 1 20 1 0 0 2.68 3 0.32
1857 914 1 0.7 0 1 1 60 0.9 198 5 4 740 840 3736 14 8 5 1 0 0 2.84 3 0.16
18 1131 1 0.5 1 11 0 49 0.6 101 5 18 658 878 1835 19 13 16 1 1 0 0.95 1 0.05

Get the Mean Absolute Error

The MAE tells us, on average, how far our predictions fall from the actual values. In this case it means the model returns a result within about 0.17 of the actual price_range on average (the target takes values from 0 to 3).

score = mean_absolute_error(y_valid, predictions)
print('MAE:', score)
MAE: 0.171375
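As a quick sanity check, the same number can be recovered from the diff column we built above:

# The MAE is just the mean of the absolute differences we already computed.
print('MAE by hand:', results['diff'].mean())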

XGBoost

We'll use the same data as above

df = pd.read_csv(DATA_FILE)

X = df.drop('price_range', axis=1)
y = df.price_range

X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

categorical_cols = [cname for cname in X_train_full.columns
                    if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"]

numerical_cols = [cname for cname in X_train_full.columns
                  if X_train_full[cname].dtype in ['int64', 'float64']]

my_cols = categorical_cols + numerical_cols

X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

numerical_transformer = SimpleImputer(strategy='constant')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

Train the Model

Using the Default Parameters

1. Create Model Instance

Below is a model instance created with no parameters, so everything uses its defaults; we can train it like so:

model = XGBRegressor()

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)])

pipeline.fit(X_train, y_train)

Some of the parameters we can set are:

  • n_estimators is essentially how many trees we want in the ensemble; this is usually between 100 and 1000, but the right value depends on the learning rate
  • learning_rate scales how much each new tree contributes; by default this is 0.1, and choosing a lower value means each tree contributes less (so more trees are typically needed), which can help prevent overfitting
  • early_stopping_rounds is the number of rounds without improvement after which training stops adding estimators; this requires a set of evaluation data (eval_set) to measure against, and a good starting value is early_stopping_rounds=5 (see the sketch after this list)
  • objective is a string or function that lets us specify the objective/type of model we would like to build; the list of built-in objectives is in the XGBoost documentation
  • If using a multi-class (multi:softmax) objective you also have to state the number of classes, e.g. num_class=4 (price_range here has four classes, 0 through 3)
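Here is a minimal sketch of early stopping on a bare XGBRegressor, with no pipeline involved (our splits are all-numeric, so they can be fed to the model directly; the fit-time arguments follow the older xgboost sklearn API used throughout this post):

# Sketch: let early stopping pick the effective number of trees.
sketch_model = XGBRegressor(n_estimators=1000, learning_rate=0.1)
sketch_model.fit(X_train, y_train,
                 early_stopping_rounds=5,
                 eval_set=[(X_valid, y_valid)],
                 verbose=False)
print('Stopped at iteration:', sketch_model.best_iteration)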

Below we'll use a slightly more complex model configuration:

model = XGBRegressor(n_estimators=1000, learning_rate=0.1, objective='multi:softmax', num_class=4)

2. Add the Model to a Pipeline

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)])

3. Train the Pipeline

  • Note that we need to pre-format our eval_set data so that it has the preprocessing steps applied and the data structures are aligned
  • We also need to prefix any arguments that we want passed on to our model with model__ so that the pipeline routes them to the correct step

preprocessor.fit(X_train)  # fit on the training data so eval_set is transformed the same way as X_train
X_valid_transformed = preprocessor.transform(X_valid)
pipeline.fit(X_train, y_train, 
                model__early_stopping_rounds=20, 
                model__eval_set=[(X_valid_transformed, y_valid)],
                model__verbose=False)
Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['battery_power', 'blue',
                                                   'clock_speed', 'dual_sim',
                                                   'fc', 'four_g', 'int_memory',
                                                   'm_dep...
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=1, gamma=0,
                              importance_type='gain', learning_rate=0.1,
                              max_delta_step=0, max_depth=3, min_child_weight=1,
                              missing=None, n_estimators=1000, n_jobs=1,
                              nthread=None, num_class=4,
                              objective='multi:softmax', random_state=0,
                              reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                              seed=None, silent=None, subsample=1,
                              verbosity=1))],
         verbose=False)

4. Predict using the Pipeline

predictions = pipeline.predict(X_valid)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_valid)))
Mean Absolute Error: 0.0575
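Where does a good n_estimators come from? After fitting with early stopping, the stopping point can be read back off the model step of the pipeline (a sketch; best_iteration is exposed by the xgboost sklearn wrapper after an early-stopped fit):

# Sketch: recover where early stopping settled; this is the basis for the
# n_estimators value used in the cross-validation section below.
fitted_model = pipeline.named_steps['model']
print('Best iteration:', fitted_model.best_iteration)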

Cross Validation

We can do cross-validation using the cross_val_score function from sklearn by:

  1. Defining the pipeline
  2. Defining the number of folds
  3. Defining the model
  4. Applying the cross-validation to the pipeline

1. Define the Pipeline

df = pd.read_csv(DATA_FILE)

X = df.drop('price_range', axis=1)
y = df.price_range

numerical_transformer = SimpleImputer(strategy='constant')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])
# n_estimators based on the early-stopping result from the previous section
model = XGBRegressor(n_estimators=190, learning_rate=0.1, objective='multi:softmax', num_class=4)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

validation_result = cross_val_score(pipeline, X, y, cv=3)
validation_result
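Note that for a regressor cross_val_score defaults to the R² score; to get numbers comparable to the MAE above, a scoring argument can be passed (a sketch; scikit-learn negates error scorers so that higher is always better):

# Sketch: cross-validate with MAE instead of the default R² scorer.
scores = cross_val_score(pipeline, X, y, cv=3, scoring='neg_mean_absolute_error')
print('MAE per fold:', -scores)
print('Average MAE:', -scores.mean())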