import pandas as pd

# Basic syntax (template -- substitute your own dataframe and column name):
#   df_onehot = pd.get_dummies(df, columns=['col_name'], prefix=['one_hot'],
#                              dtype=int)
# Where:
# - get_dummies creates a one-hot encoding for each unique categorical
#   value in the column named col_name
# - The prefix is added at the beginning of each categorical value
#   to create new column names for the one-hot columns
# - dtype=int keeps the indicator columns as 0/1 integers; pandas >= 2.0
#   returns True/False booleans when dtype is not given

# Example usage:
# Build example dataframe:
df = pd.DataFrame(['sunny', 'rainy', 'cloudy'], columns=['weather'])
print(df)
#    weather
# 0    sunny
# 1    rainy
# 2   cloudy

# Convert categorical weather variable to one-hot encoding.
# The new columns are ordered by sorted category value (cloudy, rainy, sunny),
# not by order of appearance in the data.
df_onehot = pd.get_dummies(
    df, columns=['weather'], prefix=['one_hot'], dtype=int
)
print(df_onehot)
#    one_hot_cloudy  one_hot_rainy  one_hot_sunny
# 0               0              0              1
# 1               0              1              0
# 2               1              0              0
from sklearn.preprocessing import OneHotEncoder

# One can discard categories not seen during fit:
enc = OneHotEncoder(handle_unknown='ignore')
X = [['Male', 1], ['Female', 3], ['Female', 2]]
print(enc.fit(X))
# OneHotEncoder(handle_unknown='ignore')
print(enc.categories_)
# [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]

# Group 4 was not seen during fit, so its three group columns are all zero.
print(enc.transform([['Female', 1], ['Male', 4]]).toarray())
# [[1. 0. 1. 0. 0.]
#  [0. 1. 0. 0. 0.]]

# A feature whose indicator columns are all zero is decoded back to None.
print(enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]]))
# [['Male' 1]
#  [None 2]]

# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the replacement.
print(enc.get_feature_names_out(['gender', 'group']))
# ['gender_Female' 'gender_Male' 'group_1' 'group_2' 'group_3']

# One can always drop the first column for each feature:
drop_enc = OneHotEncoder(drop='first').fit(X)
print(drop_enc.categories_)
# [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
print(drop_enc.transform([['Female', 1], ['Male', 2]]).toarray())
# [[0. 0. 0.]
#  [1. 1. 0.]]

# Or drop a column only for features that have exactly 2 categories:
drop_binary_enc = OneHotEncoder(drop='if_binary').fit(X)
print(drop_binary_enc.transform([['Female', 1], ['Male', 2]]).toarray())
# [[0. 1. 0. 0.]
#  [1. 0. 1. 0.]]
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# X is the feature matrix prepared earlier by the caller (not defined in this
# snippet). Column index 1 is one-hot encoded; remainder='passthrough' keeps
# every other column unchanged.
# sparse_threshold=0 forces a dense result: OneHotEncoder emits a sparse
# matrix by default, and np.array() on a sparse matrix does not produce a
# numeric 2-D array.
ct = ColumnTransformer(
    [('encoder', OneHotEncoder(), [1])],
    remainder='passthrough',
    sparse_threshold=0,
)
# np.float was only an alias for the builtin float; it was deprecated in
# NumPy 1.20 and removed in 1.24, so use float directly.
X = np.array(ct.fit_transform(X), dtype=float)
from sklearn.preprocessing import OneHotEncoder

# df is an existing DataFrame with a nominal column 'nom_0' (defined by the
# caller, not in this snippet).
enc = OneHotEncoder()
# Fit on the column and transform it. The fitted encoder stays bound to `enc`
# so it can be reused later (e.g. enc.transform on held-out data); the dense
# 0/1 matrix gets its own name.
encoded = enc.fit_transform(df[['nom_0']]).toarray()
# Convert the array to a dataframe. Reusing df's index keeps rows aligned in
# the concat below even when df does not have a default 0..n-1 index, and
# get_feature_names_out gives readable 'nom_0_<category>' column names
# instead of 0, 1, 2, ...
encoded_colm = pd.DataFrame(
    encoded,
    columns=enc.get_feature_names_out(['nom_0']),
    index=df.index,
)
# Append the one-hot columns to the original dataframe.
df = pd.concat([df, encoded_colm], axis=1)
# Remove the original column that has now been encoded.
df = df.drop(['nom_0'], axis=1)
# Displays the first rows in a notebook/REPL; wrap in print() in a script.
df.head(10)