In the pipeline code below, even though I have encoded the Sex column, I am getting a "could not convert string to float" error.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
import numpy as np
# Reproduction of the failing pipeline from the question: three chained
# ColumnTransformer stages followed by a decision tree. The column indices
# are kept exactly as posted so the reported error can still be reproduced.
# Step 1: Imputation
trf1 = ColumnTransformer([
    ('impute_age', SimpleImputer(), [2]),  # Impute Age
    ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6])  # Impute Embarked
], remainder='passthrough')
# Step 2: One-Hot Encoding
# NOTE(review): these indices address trf1's OUTPUT, not the original input.
# ColumnTransformer emits its listed transformers' columns first and the
# passthrough remainder afterwards, so index 1 is now the imputed Embarked
# column and, if Sex was input column 1 as the comment implies, Sex has moved
# to index 3 and is never encoded -- the likely source of the string-to-float
# error.
trf2 = ColumnTransformer([
    ('onehot_sex_embarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1, 6])  # Encode Sex and Embarked
], remainder='passthrough')
# Step 3: Scaling
# slice(0, None) selects every column, so the remainder here is empty and any
# column still holding strings is handed to MinMaxScaler.
trf3 = ColumnTransformer([
    ('scale', MinMaxScaler(), slice(0, None))  # Scale all columns
], remainder='passthrough')
# Step 4: Classifier
trf4 = DecisionTreeClassifier()
# Create pipeline
pipe = Pipeline([
    ('trf1', trf1),  # Step 1: Imputation
    ('trf2', trf2),  # Step 2: One-hot encoding
    ('trf3', trf3),  # Step 3: Scaling
    ('trf4', trf4)  # Step 4: Model
])
# Ensure proper ha
# Fit the pipeline
# X_train / y_train are defined outside this snippet.
pipe.fit(X_train, y_train)
Error:
What is the reason for the error?
In the pipeline code below, even though I have encoded the Sex column, I am getting a "could not convert string to float" error.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
import numpy as np
# Failing pipeline as posted in the question: three chained ColumnTransformer
# stages (impute -> one-hot encode -> scale) followed by a decision tree.
# Step 1: Imputation
# Columns are selected by position in the input passed to fit().
trf1 = ColumnTransformer([
('impute_age', SimpleImputer(), [2]), # Impute Age
('impute_embarked', SimpleImputer(strategy='most_frequent'), [6]) # Impute Embarked
], remainder='passthrough')
# Step 2: One-Hot Encoding
# NOTE(review): [1, 6] index into trf1's output, where the two imputed columns
# come first and the passthrough columns follow; they presumably no longer
# point at Sex and Embarked -- verify against the transformed array.
trf2 = ColumnTransformer([
('onehot_sex_embarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1, 6]) # Encode Sex and Embarked
], remainder='passthrough')
# Step 3: Scaling
# slice(0, None) selects every column, so nothing is left for the remainder.
trf3 = ColumnTransformer([
('scale', MinMaxScaler(), slice(0, None)) # Scale all columns
], remainder='passthrough')
# Step 4: Classifier
trf4 = DecisionTreeClassifier()
# Create pipeline
# Steps run in the listed order; each stage receives the previous stage's output.
pipe = Pipeline([
('trf1', trf1), # Step 1: Imputation
('trf2', trf2), # Step 2: One-hot encoding
('trf3', trf3), # Step 3: Scaling
('trf4', trf4) # Step 4: Model
])
# Ensure proper ha
# Fit the pipeline
# X_train / y_train are defined outside this snippet.
pipe.fit(X_train, y_train)
Error:
What is the reason for the error?
Share Improve this question edited Jan 19 at 19:31 James Z 12.3k10 gold badges27 silver badges47 bronze badges asked Jan 19 at 13:10 Abubakker HashmiAbubakker Hashmi 91 bronze badge 2 |1 Answer
It is generally not a good idea to chain multiple `ColumnTransformer()` stages in a row: each stage reorders its output columns, so keeping track of positional column indices becomes tedious, if it is possible at all.
I'd advise wrapping the transforms into a `Pipeline()` for each transformation group and thus sticking to a single `ColumnTransformer()`.
Wrapping a scaler into a `ColumnTransformer()` is also redundant (and the effect of a scaler on a tree model is likely insignificant).
Revised code, assuming the Titanic column naming:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
import numpy as np
# Step 1: Preprocessing
# A single ColumnTransformer covers every column group, and columns are
# addressed by name. Embarked needs two operations in sequence, so they are
# bundled into their own sub-pipeline.
embarked_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)),
])
trf1 = ColumnTransformer(
    transformers=[
        ('impute', SimpleImputer(), ['Age']),
        ('impute_and_encode', embarked_pipeline, ['Embarked']),
        ('encode', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), ['Sex']),
    ],
    remainder='passthrough',
    force_int_remainder_cols=False,
)

# Step 2: Scaling -- applied to the whole preprocessed matrix, so no
# ColumnTransformer wrapper is used.
trf2 = MinMaxScaler()

# Step 3: Classifier -- fixed seed passed as random_state.
trf3 = DecisionTreeClassifier(random_state=177013)

# Create pipeline: preprocessing -> scaling -> model
pipe = Pipeline(steps=[
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3),
])

# Fit the pipeline (X_train / y_train are defined outside this snippet)
pipe.fit(X_train, y_train)
Comment: The code in the question expects `Embarked` to be column number 6 both before and after the imputation step. It won't be. See e.g. stackoverflow.com/q/62225230/10495893 and its Linked pages – Ben Reiniger Commented Jan 19 at 14:32