切分資料¶
- .cut(): 依照數值範圍將資料切分到各個 bin 裡面去;可以指定 bin 的數量(等寬切分),也可以直接給定 bin 的邊界。
- .qcut(): 依照分位數切分資料,讓指定數量的 bin 裡面的資料筆數盡量相等。
In [1]:
Copied!
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
In [16]:
Copied!
df1 = pd.DataFrame(np.linspace(0.5, 9.5, 10))
df1 = pd.DataFrame(np.linspace(0.5, 9.5, 10))
In [17]:
Copied!
df1
df1
Out[17]:
 | 0 |
---|---|
0 | 0.5 |
1 | 1.5 |
2 | 2.5 |
3 | 3.5 |
4 | 4.5 |
5 | 5.5 |
6 | 6.5 |
7 | 7.5 |
8 | 8.5 |
9 | 9.5 |
In [18]:
Copied!
np.random.seed(987)
df2 = pd.DataFrame(np.random.normal(5, 2, 10))
np.random.seed(987)
df2 = pd.DataFrame(np.random.normal(5, 2, 10))
In [19]:
Copied!
df2
df2
Out[19]:
 | 0 |
---|---|
0 | 1.628146 |
1 | 2.057760 |
2 | 4.776400 |
3 | 7.072740 |
4 | 6.322080 |
5 | 6.765825 |
6 | 2.833345 |
7 | 3.305737 |
8 | 5.316556 |
9 | 5.198195 |
.cut()¶
In [20]:
Copied!
pd.cut(df1[0], 3)
pd.cut(df1[0], 3)
Out[20]:
0    (0.491, 3.5]
1    (0.491, 3.5]
2    (0.491, 3.5]
3    (0.491, 3.5]
4      (3.5, 6.5]
5      (3.5, 6.5]
6      (3.5, 6.5]
7      (6.5, 9.5]
8      (6.5, 9.5]
9      (6.5, 9.5]
Name: 0, dtype: category
Categories (3, interval[float64, right]): [(0.491, 3.5] < (3.5, 6.5] < (6.5, 9.5]]
In [21]:
Copied!
pd.cut(df1[0], range(0, 11, 2))
pd.cut(df1[0], range(0, 11, 2))
Out[21]:
0     (0, 2]
1     (0, 2]
2     (2, 4]
3     (2, 4]
4     (4, 6]
5     (4, 6]
6     (6, 8]
7     (6, 8]
8    (8, 10]
9    (8, 10]
Name: 0, dtype: category
Categories (5, interval[int64, right]): [(0, 2] < (2, 4] < (4, 6] < (6, 8] < (8, 10]]
In [22]:
Copied!
pd.cut(df1[0], range(0, 11, 2), labels=[1, 2, 3, 4, 5])
pd.cut(df1[0], range(0, 11, 2), labels=[1, 2, 3, 4, 5])
Out[22]:
0    1
1    1
2    2
3    2
4    3
5    3
6    4
7    4
8    5
9    5
Name: 0, dtype: category
Categories (5, int64): [1 < 2 < 3 < 4 < 5]
.qcut()¶
In [23]:
Copied!
df1
df1
Out[23]:
 | 0 |
---|---|
0 | 0.5 |
1 | 1.5 |
2 | 2.5 |
3 | 3.5 |
4 | 4.5 |
5 | 5.5 |
6 | 6.5 |
7 | 7.5 |
8 | 8.5 |
9 | 9.5 |
In [24]:
Copied!
pd.qcut(df1[0], 5)
pd.qcut(df1[0], 5)
Out[24]:
0    (0.499, 2.3]
1    (0.499, 2.3]
2      (2.3, 4.1]
3      (2.3, 4.1]
4      (4.1, 5.9]
5      (4.1, 5.9]
6      (5.9, 7.7]
7      (5.9, 7.7]
8      (7.7, 9.5]
9      (7.7, 9.5]
Name: 0, dtype: category
Categories (5, interval[float64, right]): [(0.499, 2.3] < (2.3, 4.1] < (4.1, 5.9] < (5.9, 7.7] < (7.7, 9.5]]
In [25]:
Copied!
pd.qcut(df2[0], 5)
pd.qcut(df2[0], 5)
Out[25]:
0    (1.627, 2.678]
1    (1.627, 2.678]
2    (4.188, 5.246]
3    (6.411, 7.073]
4    (5.246, 6.411]
5    (6.411, 7.073]
6    (2.678, 4.188]
7    (2.678, 4.188]
8    (5.246, 6.411]
9    (4.188, 5.246]
Name: 0, dtype: category
Categories (5, interval[float64, right]): [(1.627, 2.678] < (2.678, 4.188] < (4.188, 5.246] < (5.246, 6.411] < (6.411, 7.073]]
In [26]:
Copied!
pd.qcut(df2[0], 5, labels=[1, 2, 3, 4, 5])
pd.qcut(df2[0], 5, labels=[1, 2, 3, 4, 5])
Out[26]:
0    1
1    1
2    3
3    5
4    4
5    5
6    2
7    2
8    4
9    3
Name: 0, dtype: category
Categories (5, int64): [1 < 2 < 3 < 4 < 5]