5. Pandas apply and numpy vectorization

When operating over Pandas dataframes, avoid row-by-row Python loops. Prefer vectorized Pandas or NumPy operations. `apply` can be useful, but with `axis=1` it still calls a Python function once per row, so it is usually not the fastest option.

In dataframe code, the highest-leverage improvement is usually moving work from Python-level row iteration into array-oriented operations. That shift tends to improve both performance and clarity because the code starts describing whole-column transformations.

5.1. Don’t do this

 1import numpy as np
 2import pandas as pd
 3
 4np.random.seed(37)
 5
 6def get_df():
 7    N = 10000
 8    M = 50
 9
10    get_x = lambda x: np.random.normal(x, 1, N).reshape(-1, 1)
11    get_y = lambda x: np.full(N, -1).reshape(-1, 1)
12
13
14    X = np.hstack([get_x(x) if x < M - 1 else get_y(x) for x in range(M)])
15    columns=[f'X{i}' if i < M - 1 else 'y' for i in range(M)]
16
17    return pd.DataFrame(X, columns=columns)
18
19df = get_df()
20
21# standard for loop
22for row in range(len(df)):
23    total = np.sum(df.iloc[row][0:df.shape[1] - 1])
24    y = 1 if total > 1175 else 0
25    df['y'].iloc[row] = y
 1import numpy as np
 2import pandas as pd
 3
 4np.random.seed(37)
 5
 6def get_df():
 7    N = 10000
 8    M = 50
 9
10    get_x = lambda x: np.random.normal(x, 1, N).reshape(-1, 1)
11    get_y = lambda x: np.full(N, -1).reshape(-1, 1)
12
13
14    X = np.hstack([get_x(x) if x < M - 1 else get_y(x) for x in range(M)])
15    columns=[f'X{i}' if i < M - 1 else 'y' for i in range(M)]
16
17    return pd.DataFrame(X, columns=columns)
18
19df = get_df()
20
21# pandas iterrows
22for i, r in df.iterrows():
23    total = np.sum(r[0:df.shape[1] - 1])
24    y = 1 if total > 1175 else 0
25    df['y'].iloc[i] = y

5.2. Do this

 1import numpy as np
 2import pandas as pd
 3
 4np.random.seed(37)
 5
 6def get_df():
 7    N = 10000
 8    M = 50
 9
10    get_x = lambda x: np.random.normal(x, 1, N).reshape(-1, 1)
11    get_y = lambda x: np.full(N, -1).reshape(-1, 1)
12
13
14    X = np.hstack([get_x(x) if x < M - 1 else get_y(x) for x in range(M)])
15    columns=[f'X{i}' if i < M - 1 else 'y' for i in range(M)]
16
17    return pd.DataFrame(X, columns=columns)
18
19df = get_df()
20
21feature_columns = [c for c in df.columns if c != 'y']
22totals = df[feature_columns].sum(axis=1)
23df['y'] = np.where(totals > 1175, 1, 0)