45. Pandas apply and numpy vectorization
When operating on pandas DataFrames, avoid explicit Python for loops; favor the apply function and, wherever the computation allows it, NumPy vectorization.
45.1. Don’t do this
# Anti-pattern example: label every row with an index-based Python for loop.
import numpy as np
import pandas as pd

np.random.seed(37)


def get_df(n_rows=10000, n_cols=50):
    """Build a demo frame of Gaussian feature columns plus a label column.

    Columns ``X0`` .. ``X{n_cols - 2}`` are drawn from a normal distribution
    whose mean equals the column index (standard deviation 1).  The last
    column, ``y``, is filled with the placeholder value -1.

    Args:
        n_rows: Number of rows to generate.
        n_cols: Total number of columns, including the trailing ``y``.

    Returns:
        A ``pandas.DataFrame`` of shape ``(n_rows, n_cols)``.
    """
    n_features = n_cols - 1
    # One draw per column, in column order, so the globally seeded random
    # stream always yields the same data.
    features = [
        np.random.normal(mean, 1, n_rows).reshape(-1, 1)
        for mean in range(n_features)
    ]
    placeholder = np.full((n_rows, 1), -1)
    X = np.hstack(features + [placeholder])
    columns = [f'X{i}' for i in range(n_features)] + ['y']
    return pd.DataFrame(X, columns=columns)


df = get_df()

# standard for loop
n_features = df.shape[1] - 1  # every column except the trailing 'y'
for row in range(len(df)):
    total = df.iloc[row, :n_features].sum()
    # Write through a single indexer on df itself; the chained form
    # df['y'].iloc[row] = ... may assign to a temporary copy instead of df.
    df.iat[row, n_features] = 1 if total > 1175 else 0
# Anti-pattern example: label every row by looping over DataFrame.iterrows().
import numpy as np
import pandas as pd

np.random.seed(37)


def get_df(n_rows=10000, n_cols=50):
    """Build a demo frame of Gaussian feature columns plus a label column.

    Columns ``X0`` .. ``X{n_cols - 2}`` are drawn from a normal distribution
    whose mean equals the column index (standard deviation 1).  The last
    column, ``y``, is filled with the placeholder value -1.

    Args:
        n_rows: Number of rows to generate.
        n_cols: Total number of columns, including the trailing ``y``.

    Returns:
        A ``pandas.DataFrame`` of shape ``(n_rows, n_cols)``.
    """
    n_features = n_cols - 1
    # One draw per column, in column order, so the globally seeded random
    # stream always yields the same data.
    features = [
        np.random.normal(mean, 1, n_rows).reshape(-1, 1)
        for mean in range(n_features)
    ]
    placeholder = np.full((n_rows, 1), -1)
    X = np.hstack(features + [placeholder])
    columns = [f'X{i}' for i in range(n_features)] + ['y']
    return pd.DataFrame(X, columns=columns)


df = get_df()

# pandas iterrows
# Labels are collected and assigned once after the loop: the frame is not
# modified while it is being iterated, and no row index is needed for the
# write-back.
labels = []
for _, r in df.iterrows():
    total = r.iloc[:-1].sum()  # all columns except the trailing 'y'
    labels.append(1 if total > 1175 else 0)
df['y'] = labels
45.2. Do this
# Example: label every row with DataFrame.apply instead of an explicit loop.
import numpy as np
import pandas as pd

np.random.seed(37)


def get_df(n_rows=10000, n_cols=50):
    """Build a demo frame of Gaussian feature columns plus a label column.

    Columns ``X0`` .. ``X{n_cols - 2}`` are drawn from a normal distribution
    whose mean equals the column index (standard deviation 1).  The last
    column, ``y``, is filled with the placeholder value -1.

    Args:
        n_rows: Number of rows to generate.
        n_cols: Total number of columns, including the trailing ``y``.

    Returns:
        A ``pandas.DataFrame`` of shape ``(n_rows, n_cols)``.
    """
    n_features = n_cols - 1
    # One draw per column, in column order, so the globally seeded random
    # stream always yields the same data.
    features = [
        np.random.normal(mean, 1, n_rows).reshape(-1, 1)
        for mean in range(n_features)
    ]
    placeholder = np.full((n_rows, 1), -1)
    X = np.hstack(features + [placeholder])
    columns = [f'X{i}' for i in range(n_features)] + ['y']
    return pd.DataFrame(X, columns=columns)


df = get_df()

# pandas apply
# The feature count is looked up once rather than inside the per-row lambda,
# and .iloc makes the positional slice explicit.
n_features = df.shape[1] - 1  # every column except the trailing 'y'
df['y'] = df.apply(
    lambda r: 1 if r.iloc[:n_features].sum() > 1175 else 0,
    axis=1,
)
# Example: label every row with a single vectorized NumPy expression.
import numpy as np
import pandas as pd

np.random.seed(37)


def get_df(n_rows=10000, n_cols=50):
    """Build a demo frame of Gaussian feature columns plus a label column.

    Columns ``X0`` .. ``X{n_cols - 2}`` are drawn from a normal distribution
    whose mean equals the column index (standard deviation 1).  The last
    column, ``y``, is filled with the placeholder value -1.

    Args:
        n_rows: Number of rows to generate.
        n_cols: Total number of columns, including the trailing ``y``.

    Returns:
        A ``pandas.DataFrame`` of shape ``(n_rows, n_cols)``.
    """
    n_features = n_cols - 1
    # One draw per column, in column order, so the globally seeded random
    # stream always yields the same data.
    features = [
        np.random.normal(mean, 1, n_rows).reshape(-1, 1)
        for mean in range(n_features)
    ]
    placeholder = np.full((n_rows, 1), -1)
    X = np.hstack(features + [placeholder])
    columns = [f'X{i}' for i in range(n_features)] + ['y']
    return pd.DataFrame(X, columns=columns)


df = get_df()

# numpy vectorization
# Row sums and the threshold comparison both run on whole arrays; no Python
# function is called per row or per element.
feature_cols = df.columns[df.columns != 'y']
totals = df[feature_cols].to_numpy().sum(axis=1)
df['y'] = np.where(totals > 1175, 1, 0)