1. 构造数据集
import time
import warnings
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
warnings.filterwarnings('ignore')
# Sample data: a 10x2 frame of booleans, True and False drawn with equal
# probability.
df_data = pd.DataFrame(
    np.random.choice([True, False], size=(10, 2), p=[0.5, 0.5])
)
df_data
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: center;
}
|
0 |
1 |
| 0 |
True |
True |
| 1 |
True |
False |
| 2 |
False |
False |
| 3 |
True |
False |
| 4 |
False |
True |
| 5 |
True |
False |
| 6 |
False |
True |
| 7 |
True |
False |
| 8 |
True |
False |
| 9 |
False |
False |
2. replace方法
# Convert booleans to integers by value substitution: True -> 1, False -> 0.
# The work is done on a deep copy of the sample frame.
df_replace = df_data.copy(deep=True).replace({True: 1, False: 0})
df_replace
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
0 |
1 |
| 0 |
1 |
1 |
| 1 |
1 |
0 |
| 2 |
0 |
0 |
| 3 |
1 |
0 |
| 4 |
0 |
1 |
| 5 |
1 |
0 |
| 6 |
0 |
1 |
| 7 |
1 |
0 |
| 8 |
1 |
0 |
| 9 |
0 |
0 |
3. applymap方法
# Convert booleans to integers element by element with DataFrame.applymap.
# NOTE: applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# it is kept here because this cell demonstrates the applymap API itself.
df_applymap = df_data.copy(deep=True).applymap(
    lambda flag: 1 if flag else 0
)
df_applymap
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: center;
}
|
0 |
1 |
| 0 |
1 |
1 |
| 1 |
1 |
0 |
| 2 |
0 |
0 |
| 3 |
1 |
0 |
| 4 |
0 |
1 |
| 5 |
1 |
0 |
| 6 |
0 |
1 |
| 7 |
1 |
0 |
| 8 |
1 |
0 |
| 9 |
0 |
0 |
4. astype方法
# Convert booleans to integers with a single dtype cast on a deep copy of
# the sample frame.
df_astype = df_data.copy(deep=True).astype('int')
df_astype
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: center;
}
|
0 |
1 |
| 0 |
1 |
1 |
| 1 |
1 |
0 |
| 2 |
0 |
0 |
| 3 |
1 |
0 |
| 4 |
0 |
1 |
| 5 |
1 |
0 |
| 6 |
0 |
1 |
| 7 |
1 |
0 |
| 8 |
1 |
0 |
| 9 |
0 |
0 |
5. apply方法
# Convert booleans to integers with nested apply calls: the outer apply
# receives each column as a Series, the inner apply maps every element.
df_apply = df_data.copy(deep=True).apply(
    lambda column: column.apply(lambda flag: 1 if flag else 0)
)
df_apply
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: center;
}
|
0 |
1 |
| 0 |
1 |
1 |
| 1 |
1 |
0 |
| 2 |
0 |
0 |
| 3 |
1 |
0 |
| 4 |
0 |
1 |
| 5 |
1 |
0 |
| 6 |
0 |
1 |
| 7 |
1 |
0 |
| 8 |
1 |
0 |
| 9 |
0 |
0 |
6. map方法
# Convert booleans to integers element by element with DataFrame.map
# (available from pandas 2.1).
df_map = df_data.copy(deep=True).map(lambda flag: 1 if flag else 0)
df_map
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: center;
}
|
0 |
1 |
| 0 |
1 |
1 |
| 1 |
1 |
0 |
| 2 |
0 |
0 |
| 3 |
1 |
0 |
| 4 |
0 |
1 |
| 5 |
1 |
0 |
| 6 |
0 |
1 |
| 7 |
1 |
0 |
| 8 |
1 |
0 |
| 9 |
0 |
0 |
7. 效率对比
def use_replace(shape, df_data):
    """Time one ``DataFrame.replace`` conversion of True/False to 1/0.

    Args:
        shape: Label for the frame being measured; returned unchanged as the
            first element of the result.
        df_data: DataFrame to convert. The converted frame is discarded;
            only the elapsed time is reported.

    Returns:
        Tuple ``(shape, "replace_func_run_time", seconds)`` where ``seconds``
        is the elapsed time rounded to 6 decimal places.
    """
    # perf_counter is a monotonic, high-resolution clock intended for timing
    # short intervals; time.time() can be coarse and can jump when the system
    # clock is adjusted.
    started = time.perf_counter()
    df_data.replace({True: 1, False: 0})
    elapsed = time.perf_counter() - started
    return (shape, "replace_func_run_time", round(elapsed, 6))
def use_applymap(shape, df_data):
    """Time one element-wise ``DataFrame.applymap`` conversion to 1/0.

    Args:
        shape: Label for the frame being measured; returned unchanged as the
            first element of the result.
        df_data: DataFrame to convert. The converted frame is discarded;
            only the elapsed time is reported.

    Returns:
        Tuple ``(shape, "applymap_func_run_time", seconds)`` where
        ``seconds`` is the elapsed time rounded to 6 decimal places.
    """
    # applymap is deprecated since pandas 2.1 in favour of DataFrame.map.
    # Fall back to its replacement so the benchmark keeps running on pandas
    # builds where applymap is no longer available.
    elementwise = getattr(df_data, "applymap", None)
    if elementwise is None:
        elementwise = df_data.map

    # perf_counter is a monotonic, high-resolution clock intended for timing
    # short intervals; time.time() can be coarse and can jump when the system
    # clock is adjusted.
    started = time.perf_counter()
    elementwise(lambda value: 1 if value else 0)
    elapsed = time.perf_counter() - started
    return (shape, "applymap_func_run_time", round(elapsed, 6))
def use_astype(shape, df_data):
    """Time one ``DataFrame.astype("int")`` conversion of a frame.

    Args:
        shape: Label for the frame being measured; returned unchanged as the
            first element of the result.
        df_data: DataFrame to convert. The converted frame is discarded;
            only the elapsed time is reported.

    Returns:
        Tuple ``(shape, "astype_func_run_time", seconds)`` where ``seconds``
        is the elapsed time rounded to 6 decimal places.
    """
    # perf_counter is a monotonic, high-resolution clock intended for timing
    # short intervals; time.time() can be coarse and can jump when the system
    # clock is adjusted.
    started = time.perf_counter()
    df_data.astype("int")
    elapsed = time.perf_counter() - started
    return (shape, "astype_func_run_time", round(elapsed, 6))
def use_apply(shape, df_data):
    """Time one nested ``apply`` conversion of True/False to 1/0.

    The outer ``DataFrame.apply`` receives each column as a Series and the
    inner ``Series.apply`` maps every element of that column.

    Args:
        shape: Label for the frame being measured; returned unchanged as the
            first element of the result.
        df_data: DataFrame to convert. The converted frame is discarded;
            only the elapsed time is reported.

    Returns:
        Tuple ``(shape, "apply_func_run_time", seconds)`` where ``seconds``
        is the elapsed time rounded to 6 decimal places.
    """
    # perf_counter is a monotonic, high-resolution clock intended for timing
    # short intervals; time.time() can be coarse and can jump when the system
    # clock is adjusted.
    started = time.perf_counter()
    df_data.apply(lambda column: column.apply(lambda value: 1 if value else 0))
    elapsed = time.perf_counter() - started
    return (shape, "apply_func_run_time", round(elapsed, 6))
def use_map(shape,df_data):
st = time.time()
df_data = df_data.map(lambda x:1 if x else 0)
et = time.time()
return (shape,"map_func_run_time",round(et - st,6))
# Run every conversion helper once per frame size, on a fresh random boolean
# frame, and collect the (shape, label, seconds) tuples into one table.
run_times = []
for shape in tqdm(((10, 10), (100, 100), (1000, 1000), (10000, 10000))):
    df_data = pd.DataFrame(
        np.random.choice([True, False], size=shape, p=[0.5, 0.5])
    )
    # Helpers are called in the same order as the rows of the result table.
    run_times.extend(
        timer(shape, df_data)
        for timer in (use_replace, use_applymap, use_astype, use_apply, use_map)
    )
pd.DataFrame(run_times, columns=["shape", "run_name_func", "run_time"])
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: center;
}
|
shape |
run_name_func |
run_time |
| 0 |
(10, 10) |
replace_func_run_time |
0.001113 |
| 1 |
(10, 10) |
applymap_func_run_time |
0.000500 |
| 2 |
(10, 10) |
astype_func_run_time |
0.000127 |
| 3 |
(10, 10) |
apply_func_run_time |
0.000616 |
| 4 |
(10, 10) |
map_func_run_time |
0.000245 |
| 5 |
(100, 100) |
replace_func_run_time |
0.006937 |
| 6 |
(100, 100) |
applymap_func_run_time |
0.003279 |
| 7 |
(100, 100) |
astype_func_run_time |
0.000105 |
| 8 |
(100, 100) |
apply_func_run_time |
0.004956 |
| 9 |
(100, 100) |
map_func_run_time |
0.003031 |
| 10 |
(1000, 1000) |
replace_func_run_time |
0.207553 |
| 11 |
(1000, 1000) |
applymap_func_run_time |
0.156537 |
| 12 |
(1000, 1000) |
astype_func_run_time |
0.000704 |
| 13 |
(1000, 1000) |
apply_func_run_time |
0.170808 |
| 14 |
(1000, 1000) |
map_func_run_time |
0.155087 |
| 15 |
(10000, 10000) |
replace_func_run_time |
17.793814 |
| 16 |
(10000, 10000) |
applymap_func_run_time |
15.354567 |
| 17 |
(10000, 10000) |
astype_func_run_time |
0.070461 |
| 18 |
(10000, 10000) |
apply_func_run_time |
15.329245 |
| 19 |
(10000, 10000) |
map_func_run_time |
15.474624 |
8. 总结
从上表数据来看,astype 的效率远高于其他方法:在 10000×10000 的数据上仅耗时约 0.07 秒,而 replace、applymap、apply、map 均需约 15~18 秒。因此在大数据量的情况下,应优先采用 astype 的方式来进行 True/False 到 1/0 的数值转化。