缺失值處理
In [1]:
Copied!
import numpy as np
import pandas as pd
import numpy as np
import pandas as pd
In [2]:
Copied!
# Pin NumPy's legacy global seed so the sample values are reproducible.
np.random.seed(987)
# 10 rows x 4 columns of integers in [20, 100]; the upper bound 101 is exclusive.
data = np.random.randint(low=20, high=101, size=(10, 4))
In [3]:
Copied!
# Wrap the random integer matrix in a DataFrame, naming the four columns
# directly in the constructor.
df = pd.DataFrame(data, columns=['col1', 'col2', 'col3', 'col4'])
In [4]:
Copied!
# Display the freshly built frame; every column still holds integers here.
df
df
Out[4]:
col1 | col2 | col3 | col4 | |
---|---|---|---|---|
0 | 71 | 61 | 74 | 30 |
1 | 46 | 23 | 94 | 64 |
2 | 38 | 82 | 34 | 48 |
3 | 23 | 35 | 87 | 84 |
4 | 46 | 74 | 51 | 81 |
5 | 53 | 46 | 93 | 98 |
6 | 99 | 99 | 39 | 70 |
7 | 54 | 30 | 79 | 92 |
8 | 87 | 91 | 91 | 61 |
9 | 23 | 22 | 66 | 57 |
In [5]:
Copied!
# NaN is a float, so upcast explicitly before writing it. Assigning NaN into
# int64 columns relies on silent upcasting, which pandas >= 2.1 deprecates
# (FutureWarning: "Setting an item of incompatible dtype").
df = df.astype('float64')

# (row, column) positions to blank out. Row 7 becomes entirely missing and
# rows 3 and 5 lose two values each, so the dropna variants in this notebook
# (default, how='all', thresh=3) each produce a different result.
missing_positions = [
    (1, 2),
    (3, 1), (3, 3),
    (5, 0), (5, 1),
    (7, 0), (7, 1), (7, 2), (7, 3),
]
for row_pos, col_pos in missing_positions:
    df.iloc[row_pos, col_pos] = np.nan
In [6]:
Copied!
# Display the frame after the NaN assignments; the columns are now float.
df
df
Out[6]:
col1 | col2 | col3 | col4 | |
---|---|---|---|---|
0 | 71.0 | 61.0 | 74.0 | 30.0 |
1 | 46.0 | 23.0 | NaN | 64.0 |
2 | 38.0 | 82.0 | 34.0 | 48.0 |
3 | 23.0 | NaN | 87.0 | NaN |
4 | 46.0 | 74.0 | 51.0 | 81.0 |
5 | NaN | NaN | 93.0 | 98.0 |
6 | 99.0 | 99.0 | 39.0 | 70.0 |
7 | NaN | NaN | NaN | NaN |
8 | 87.0 | 91.0 | 91.0 | 61.0 |
9 | 23.0 | 22.0 | 66.0 | 57.0 |
.dropna()
In [7]:
Copied!
# Drop every row that has at least one missing value (the dropna defaults,
# spelled out). Returns a new frame; df itself is left untouched.
df.dropna(axis=0, how='any')
Out[7]:
col1 | col2 | col3 | col4 | |
---|---|---|---|---|
0 | 71.0 | 61.0 | 74.0 | 30.0 |
2 | 38.0 | 82.0 | 34.0 | 48.0 |
4 | 46.0 | 74.0 | 51.0 | 81.0 |
6 | 99.0 | 99.0 | 39.0 | 70.0 |
8 | 87.0 | 91.0 | 91.0 | 61.0 |
9 | 23.0 | 22.0 | 66.0 | 57.0 |
In [8]:
Copied!
# Drop only the rows whose col1 is missing; NaN in other columns is tolerated.
df.dropna(axis=0, subset=['col1'])
Out[8]:
col1 | col2 | col3 | col4 | |
---|---|---|---|---|
0 | 71.0 | 61.0 | 74.0 | 30.0 |
1 | 46.0 | 23.0 | NaN | 64.0 |
2 | 38.0 | 82.0 | 34.0 | 48.0 |
3 | 23.0 | NaN | 87.0 | NaN |
4 | 46.0 | 74.0 | 51.0 | 81.0 |
6 | 99.0 | 99.0 | 39.0 | 70.0 |
8 | 87.0 | 91.0 | 91.0 | 61.0 |
9 | 23.0 | 22.0 | 66.0 | 57.0 |
In [9]:
Copied!
# Drop the rows where col1 or col3 is missing; col2 and col4 are not checked.
df.dropna(axis=0, subset=['col1', 'col3'])
Out[9]:
col1 | col2 | col3 | col4 | |
---|---|---|---|---|
0 | 71.0 | 61.0 | 74.0 | 30.0 |
2 | 38.0 | 82.0 | 34.0 | 48.0 |
3 | 23.0 | NaN | 87.0 | NaN |
4 | 46.0 | 74.0 | 51.0 | 81.0 |
6 | 99.0 | 99.0 | 39.0 | 70.0 |
8 | 87.0 | 91.0 | 91.0 | 61.0 |
9 | 23.0 | 22.0 | 66.0 | 57.0 |
In [10]:
Copied!
# Drop a row only when every one of its values is missing (row 7 here).
df.dropna(axis=0, how='all')
Out[10]:
col1 | col2 | col3 | col4 | |
---|---|---|---|---|
0 | 71.0 | 61.0 | 74.0 | 30.0 |
1 | 46.0 | 23.0 | NaN | 64.0 |
2 | 38.0 | 82.0 | 34.0 | 48.0 |
3 | 23.0 | NaN | 87.0 | NaN |
4 | 46.0 | 74.0 | 51.0 | 81.0 |
5 | NaN | NaN | 93.0 | 98.0 |
6 | 99.0 | 99.0 | 39.0 | 70.0 |
8 | 87.0 | 91.0 | 91.0 | 61.0 |
9 | 23.0 | 22.0 | 66.0 | 57.0 |
In [11]:
Copied!
# Keep only the rows that have at least 3 non-missing values.
df.dropna(axis=0, thresh=3)
Out[11]:
col1 | col2 | col3 | col4 | |
---|---|---|---|---|
0 | 71.0 | 61.0 | 74.0 | 30.0 |
1 | 46.0 | 23.0 | NaN | 64.0 |
2 | 38.0 | 82.0 | 34.0 | 48.0 |
4 | 46.0 | 74.0 | 51.0 | 81.0 |
6 | 99.0 | 99.0 | 39.0 | 70.0 |
8 | 87.0 | 91.0 | 91.0 | 61.0 |
9 | 23.0 | 22.0 | 66.0 | 57.0 |
.fillna()
In [12]:
Copied!
# Show df again before the fill examples; the dropna calls above returned
# new frames, so the NaN values are still present.
df
df
Out[12]:
col1 | col2 | col3 | col4 | |
---|---|---|---|---|
0 | 71.0 | 61.0 | 74.0 | 30.0 |
1 | 46.0 | 23.0 | NaN | 64.0 |
2 | 38.0 | 82.0 | 34.0 | 48.0 |
3 | 23.0 | NaN | 87.0 | NaN |
4 | 46.0 | 74.0 | 51.0 | 81.0 |
5 | NaN | NaN | 93.0 | 98.0 |
6 | 99.0 | 99.0 | 39.0 | 70.0 |
7 | NaN | NaN | NaN | NaN |
8 | 87.0 | 91.0 | 91.0 | 61.0 |
9 | 23.0 | 22.0 | 66.0 | 57.0 |
In [13]:
Copied!
# Replace every missing value with the constant 0 (returns a new frame).
df.fillna(value=0)
Out[13]:
col1 | col2 | col3 | col4 | |
---|---|---|---|---|
0 | 71.0 | 61.0 | 74.0 | 30.0 |
1 | 46.0 | 23.0 | 0.0 | 64.0 |
2 | 38.0 | 82.0 | 34.0 | 48.0 |
3 | 23.0 | 0.0 | 87.0 | 0.0 |
4 | 46.0 | 74.0 | 51.0 | 81.0 |
5 | 0.0 | 0.0 | 93.0 | 98.0 |
6 | 99.0 | 99.0 | 39.0 | 70.0 |
7 | 0.0 | 0.0 | 0.0 | 0.0 |
8 | 87.0 | 91.0 | 91.0 | 61.0 |
9 | 23.0 | 22.0 | 66.0 | 57.0 |
In [15]:
Copied!
# Replace each missing value with the mean of its own column. df.mean()
# yields one value per column, and fillna matches them up by column label.
df.fillna(value=df.mean(axis=0))
Out[15]:
col1 | col2 | col3 | col4 | |
---|---|---|---|---|
0 | 71.000 | 61.000000 | 74.000 | 30.000 |
1 | 46.000 | 23.000000 | 66.875 | 64.000 |
2 | 38.000 | 82.000000 | 34.000 | 48.000 |
3 | 23.000 | 64.571429 | 87.000 | 63.625 |
4 | 46.000 | 74.000000 | 51.000 | 81.000 |
5 | 54.125 | 64.571429 | 93.000 | 98.000 |
6 | 99.000 | 99.000000 | 39.000 | 70.000 |
7 | 54.125 | 64.571429 | 66.875 | 63.625 |
8 | 87.000 | 91.000000 | 91.000 | 61.000 |
9 | 23.000 | 22.000000 | 66.000 | 57.000 |
In [16]:
Copied!
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'); the `method` argument of
# fillna is deprecated since pandas 2.1.
df.ffill()
Out[16]:
col1 | col2 | col3 | col4 | |
---|---|---|---|---|
0 | 71.0 | 61.0 | 74.0 | 30.0 |
1 | 46.0 | 23.0 | 74.0 | 64.0 |
2 | 38.0 | 82.0 | 34.0 | 48.0 |
3 | 23.0 | 82.0 | 87.0 | 48.0 |
4 | 46.0 | 74.0 | 51.0 | 81.0 |
5 | 46.0 | 74.0 | 93.0 | 98.0 |
6 | 99.0 | 99.0 | 39.0 | 70.0 |
7 | 99.0 | 99.0 | 39.0 | 70.0 |
8 | 87.0 | 91.0 | 91.0 | 61.0 |
9 | 23.0 | 22.0 | 66.0 | 57.0 |
In [17]:
Copied!
# 'pad' was just another name for forward fill, so the result is identical to
# the forward-fill example. DataFrame.ffill() replaces fillna(method='pad');
# the `method` argument of fillna is deprecated since pandas 2.1.
df.ffill()
Out[17]:
col1 | col2 | col3 | col4 | |
---|---|---|---|---|
0 | 71.0 | 61.0 | 74.0 | 30.0 |
1 | 46.0 | 23.0 | 74.0 | 64.0 |
2 | 38.0 | 82.0 | 34.0 | 48.0 |
3 | 23.0 | 82.0 | 87.0 | 48.0 |
4 | 46.0 | 74.0 | 51.0 | 81.0 |
5 | 46.0 | 74.0 | 93.0 | 98.0 |
6 | 99.0 | 99.0 | 39.0 | 70.0 |
7 | 99.0 | 99.0 | 39.0 | 70.0 |
8 | 87.0 | 91.0 | 91.0 | 61.0 |
9 | 23.0 | 22.0 | 66.0 | 57.0 |
In [18]:
Copied!
# Backward fill: each NaN takes the next valid value below it in the same column.
# DataFrame.bfill() replaces fillna(method='bfill'); the `method` argument of
# fillna is deprecated since pandas 2.1.
df.bfill()
Out[18]:
col1 | col2 | col3 | col4 | |
---|---|---|---|---|
0 | 71.0 | 61.0 | 74.0 | 30.0 |
1 | 46.0 | 23.0 | 34.0 | 64.0 |
2 | 38.0 | 82.0 | 34.0 | 48.0 |
3 | 23.0 | 74.0 | 87.0 | 81.0 |
4 | 46.0 | 74.0 | 51.0 | 81.0 |
5 | 99.0 | 99.0 | 93.0 | 98.0 |
6 | 99.0 | 99.0 | 39.0 | 70.0 |
7 | 87.0 | 91.0 | 91.0 | 61.0 |
8 | 87.0 | 91.0 | 91.0 | 61.0 |
9 | 23.0 | 22.0 | 66.0 | 57.0 |