`Pandas`将`True/False`映射为1/0

53 阅读3分钟

1. 构造数据集

import time
import warnings
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Sample data: a 10x2 frame whose cells are drawn from {True, False}
# with equal probability.
df_data = pd.DataFrame(
    data=np.random.choice(a=[True, False], size=(10, 2), p=[0.5, 0.5]),
)
df_data
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: center; }
0 1
0 True True
1 True False
2 False False
3 True False
4 False True
5 True False
6 False True
7 True False
8 True False
9 False False

2. replace方法

# Deep-copy the sample frame, then substitute each boolean through a
# value -> value lookup table.
df_replace = df_data.copy(deep=True).replace({True: 1, False: 0})
df_replace
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
0 1
0 1 1
1 1 0
2 0 0
3 1 0
4 0 1
5 1 0
6 0 1
7 1 0
8 1 0
9 0 0

3. applymap方法

# Deep-copy the sample frame, then convert it cell by cell with applymap.
def _bool_to_int(value):
    return 1 if value else 0

df_applymap = df_data.copy(deep=True).applymap(_bool_to_int)
df_applymap
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: center; }
0 1
0 1 1
1 1 0
2 0 0
3 1 0
4 0 1
5 1 0
6 0 1
7 1 0
8 1 0
9 0 0

4. astype方法

# Deep-copy the sample frame, then cast the whole frame to 'int' in one call.
df_astype = df_data.copy(deep=True).astype('int')
df_astype
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: center; }
0 1
0 1 1
1 1 0
2 0 0
3 1 0
4 0 1
5 1 0
6 0 1
7 1 0
8 1 0
9 0 0

5. apply方法

# Deep-copy the sample frame, then use nested apply: the outer call hands over
# one Series at a time, the inner call converts each of its elements.
def _flag_to_int(flag):
    return 1 if flag else 0

def _convert_series(series):
    return series.apply(_flag_to_int)

df_apply = df_data.copy(deep=True).apply(_convert_series)
df_apply
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: center; }
0 1
0 1 1
1 1 0
2 0 0
3 1 0
4 0 1
5 1 0
6 0 1
7 1 0
8 1 0
9 0 0

6. map方法

# Deep-copy the sample frame, then convert it cell by cell with DataFrame.map.
def _truthy_to_int(value):
    return 1 if value else 0

df_map = df_data.copy(deep=True).map(_truthy_to_int)
df_map
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: center; }
0 1
0 1 1
1 1 0
2 0 0
3 1 0
4 0 1
5 1 0
6 0 1
7 1 0
8 1 0
9 0 0

7. 效率对比


def use_replace(shape, df_data):
    """Time a ``DataFrame.replace`` conversion of True/False to 1/0.

    Args:
        shape: Label for the frame's dimensions; returned unchanged.
        df_data: DataFrame to convert. It is not modified.

    Returns:
        A tuple ``(shape, "replace_func_run_time", seconds)`` where
        ``seconds`` is the elapsed time rounded to 6 decimal places.
    """
    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted and is too coarse for short runs.
    started = time.perf_counter()
    # Bind the result so its deallocation is not part of the measurement.
    converted = df_data.replace({True: 1, False: 0})
    elapsed = time.perf_counter() - started
    del converted
    return (shape, "replace_func_run_time", round(elapsed, 6))
def use_applymap(shape, df_data):
    """Time a ``DataFrame.applymap`` conversion of True/False to 1/0.

    Args:
        shape: Label for the frame's dimensions; returned unchanged.
        df_data: DataFrame to convert. It is not modified.

    Returns:
        A tuple ``(shape, "applymap_func_run_time", seconds)`` where
        ``seconds`` is the elapsed time rounded to 6 decimal places.
    """
    # NOTE: pandas 2.1 deprecated DataFrame.applymap in favour of
    # DataFrame.map; it is kept here because this benchmark compares the two.
    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted and is too coarse for short runs.
    started = time.perf_counter()
    # Bind the result so its deallocation is not part of the measurement.
    converted = df_data.applymap(lambda value: 1 if value else 0)
    elapsed = time.perf_counter() - started
    del converted
    return (shape, "applymap_func_run_time", round(elapsed, 6))
    
def use_astype(shape, df_data):
    """Time a ``DataFrame.astype("int")`` conversion of True/False to 1/0.

    Args:
        shape: Label for the frame's dimensions; returned unchanged.
        df_data: DataFrame to convert. It is not modified.

    Returns:
        A tuple ``(shape, "astype_func_run_time", seconds)`` where
        ``seconds`` is the elapsed time rounded to 6 decimal places.
    """
    # perf_counter() is monotonic and high-resolution; this matters most
    # here because the cast finishes far faster than the other approaches.
    started = time.perf_counter()
    # Bind the result so its deallocation is not part of the measurement.
    converted = df_data.astype("int")
    elapsed = time.perf_counter() - started
    del converted
    return (shape, "astype_func_run_time", round(elapsed, 6))

def use_apply(shape, df_data):
    """Time a nested ``apply`` conversion of True/False to 1/0.

    The outer ``DataFrame.apply`` hands over one Series at a time and the
    inner ``Series.apply`` converts each of its elements.

    Args:
        shape: Label for the frame's dimensions; returned unchanged.
        df_data: DataFrame to convert. It is not modified.

    Returns:
        A tuple ``(shape, "apply_func_run_time", seconds)`` where
        ``seconds`` is the elapsed time rounded to 6 decimal places.
    """
    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted and is too coarse for short runs.
    started = time.perf_counter()
    # Bind the result so its deallocation is not part of the measurement.
    converted = df_data.apply(
        lambda series: series.apply(lambda flag: 1 if flag else 0)
    )
    elapsed = time.perf_counter() - started
    del converted
    return (shape, "apply_func_run_time", round(elapsed, 6))

def use_map(shape, df_data):
    """Time a ``DataFrame.map`` conversion of True/False to 1/0.

    Args:
        shape: Label for the frame's dimensions; returned unchanged.
        df_data: DataFrame to convert. It is not modified.

    Returns:
        A tuple ``(shape, "map_func_run_time", seconds)`` where
        ``seconds`` is the elapsed time rounded to 6 decimal places.
    """
    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted and is too coarse for short runs.
    started = time.perf_counter()
    # Bind the result so its deallocation is not part of the measurement.
    converted = df_data.map(lambda value: 1 if value else 0)
    elapsed = time.perf_counter() - started
    del converted
    return (shape, "map_func_run_time", round(elapsed, 6))
# Run every conversion strategy against random boolean frames of growing
# size and collect one (shape, name, seconds) record per run.
run_times = []
benchmarks = (use_replace, use_applymap, use_astype, use_apply, use_map)
for shape in tqdm(((10, 10), (100, 100), (1000, 1000), (10000, 10000))):
    # A fresh random frame per size; all strategies at that size share it.
    df_data = pd.DataFrame(
        np.random.choice([True, False], size=shape, p=[0.5, 0.5])
    )
    run_times.extend(benchmark(shape, df_data) for benchmark in benchmarks)
pd.DataFrame(run_times, columns=["shape", "run_name_func", "run_time"])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: center; }
shape run_name_func run_time
0 (10, 10) replace_func_run_time 0.001113
1 (10, 10) applymap_func_run_time 0.000500
2 (10, 10) astype_func_run_time 0.000127
3 (10, 10) apply_func_run_time 0.000616
4 (10, 10) map_func_run_time 0.000245
5 (100, 100) replace_func_run_time 0.006937
6 (100, 100) applymap_func_run_time 0.003279
7 (100, 100) astype_func_run_time 0.000105
8 (100, 100) apply_func_run_time 0.004956
9 (100, 100) map_func_run_time 0.003031
10 (1000, 1000) replace_func_run_time 0.207553
11 (1000, 1000) applymap_func_run_time 0.156537
12 (1000, 1000) astype_func_run_time 0.000704
13 (1000, 1000) apply_func_run_time 0.170808
14 (1000, 1000) map_func_run_time 0.155087
15 (10000, 10000) replace_func_run_time 17.793814
16 (10000, 10000) applymap_func_run_time 15.354567
17 (10000, 10000) astype_func_run_time 0.070461
18 (10000, 10000) apply_func_run_time 15.329245
19 (10000, 10000) map_func_run_time 15.474624

8. 总结

从上表数据来看，astype的效率明显最高：在10000×10000的数据上，astype仅需约0.07秒，而replace、applymap、apply、map均需约15~18秒。因此在大数据量的情况下，优先采用astype的方式来进行True/False到1/0的数值转化。