45. Pandas apply and numpy vectorization

When operating on pandas DataFrames, avoid explicit for loops; prefer the apply function or, better yet, NumPy vectorization.

45.1. Don’t do this

 1import numpy as np
 2import pandas as pd
 3
 4np.random.seed(37)
 5
 6def get_df():
 7    N = 10000
 8    M = 50
 9
10    get_x = lambda x: np.random.normal(x, 1, N).reshape(-1, 1)
11    get_y = lambda x: np.full(N, -1).reshape(-1, 1)
12
13
14    X = np.hstack([get_x(x) if x < M - 1 else get_y(x) for x in range(M)])
15    columns=[f'X{i}' if i < M - 1 else 'y' for i in range(M)]
16
17    return pd.DataFrame(X, columns=columns)
18
df = get_df()

# standard for loop
y_pos = df.columns.get_loc('y')  # positional index of the label column
for row in range(len(df)):
    # Sum every column except the trailing 'y' for this row.
    total = np.sum(df.iloc[row, 0:df.shape[1] - 1])
    y = 1 if total > 1175 else 0
    # Write through a single indexer on the frame itself.  The chained form
    # df['y'].iloc[row] = y assigns into an intermediate Series and is not
    # guaranteed to update df.
    df.iloc[row, y_pos] = y
 1import numpy as np
 2import pandas as pd
 3
 4np.random.seed(37)
 5
 6def get_df():
 7    N = 10000
 8    M = 50
 9
10    get_x = lambda x: np.random.normal(x, 1, N).reshape(-1, 1)
11    get_y = lambda x: np.full(N, -1).reshape(-1, 1)
12
13
14    X = np.hstack([get_x(x) if x < M - 1 else get_y(x) for x in range(M)])
15    columns=[f'X{i}' if i < M - 1 else 'y' for i in range(M)]
16
17    return pd.DataFrame(X, columns=columns)
18
df = get_df()

# pandas iterrows
labels = []
for _, r in df.iterrows():
    # Sum every column except the trailing 'y' for this row.
    total = np.sum(r.iloc[0:df.shape[1] - 1])
    labels.append(1 if total > 1175 else 0)

# Assign once, after the loop: the frame is not modified while it is being
# iterated, and no chained df['y'].iloc[...] write (which targets an
# intermediate Series) is needed.
df['y'] = labels

45.2. Do this

 1import numpy as np
 2import pandas as pd
 3
 4np.random.seed(37)
 5
 6def get_df():
 7    N = 10000
 8    M = 50
 9
10    get_x = lambda x: np.random.normal(x, 1, N).reshape(-1, 1)
11    get_y = lambda x: np.full(N, -1).reshape(-1, 1)
12
13
14    X = np.hstack([get_x(x) if x < M - 1 else get_y(x) for x in range(M)])
15    columns=[f'X{i}' if i < M - 1 else 'y' for i in range(M)]
16
17    return pd.DataFrame(X, columns=columns)
18
df = get_df()

# pandas apply


def _label_row(r):
    """Return 1 when the row's non-``y`` columns sum to more than 1175, else 0."""
    feature_total = np.sum(r[0:df.shape[1] - 1])
    if feature_total > 1175:
        return 1
    return 0


df['y'] = df.apply(_label_row, axis=1)
 1import numpy as np
 2import pandas as pd
 3
 4np.random.seed(37)
 5
 6def get_df():
 7    N = 10000
 8    M = 50
 9
10    get_x = lambda x: np.random.normal(x, 1, N).reshape(-1, 1)
11    get_y = lambda x: np.full(N, -1).reshape(-1, 1)
12
13
14    X = np.hstack([get_x(x) if x < M - 1 else get_y(x) for x in range(M)])
15    columns=[f'X{i}' if i < M - 1 else 'y' for i in range(M)]
16
17    return pd.DataFrame(X, columns=columns)
18
df = get_df()

# numpy vectorization
# Row-wise sum over every column except 'y', computed on the raw array.
s = df.drop(columns='y').to_numpy().sum(axis=1)
# Threshold the whole array in one call instead of looping over it in Python.
df['y'] = np.where(s > 1175, 1, 0)