資料的分群(groupby)運算¶
在這個網頁中,我們介紹 pandas 中 `.groupby()` 的使用方式。
In [ ]:
Copied!
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
產生虛擬資料集¶
In [ ]:
Copied!
keys = np.random.choice(['A','B','C'], 20)
keys = np.random.choice(['A','B','C'], 20)
In [ ]:
Copied!
keys
keys
Out[ ]:
array(['B', 'C', 'C', 'B', 'B', 'B', 'B', 'B', 'C', 'C', 'A', 'B', 'B', 'A', 'C', 'B', 'C', 'C', 'A', 'B'], dtype='<U1')
In [ ]:
Copied!
values = np.random.randint(0, 10, 20)
values = np.random.randint(0, 10, 20)
In [ ]:
Copied!
df = pd.DataFrame(zip(keys, values))
df = pd.DataFrame(zip(keys, values))
In [ ]:
Copied!
df
df
Out[ ]:
| | 0 | 1 |
---|---|---|
0 | B | 4 |
1 | C | 7 |
2 | C | 5 |
3 | B | 9 |
4 | B | 7 |
5 | B | 4 |
6 | B | 9 |
7 | B | 3 |
8 | C | 5 |
9 | C | 0 |
10 | A | 1 |
11 | B | 4 |
12 | B | 2 |
13 | A | 2 |
14 | C | 1 |
15 | B | 2 |
16 | C | 6 |
17 | C | 2 |
18 | A | 5 |
19 | B | 8 |
In [ ]:
Copied!
df.columns = ['keys','values']
df.columns = ['keys','values']
In [ ]:
Copied!
df
df
Out[ ]:
| | keys | values |
---|---|---|
0 | B | 4 |
1 | C | 7 |
2 | C | 5 |
3 | B | 9 |
4 | B | 7 |
5 | B | 4 |
6 | B | 9 |
7 | B | 3 |
8 | C | 5 |
9 | C | 0 |
10 | A | 1 |
11 | B | 4 |
12 | B | 2 |
13 | A | 2 |
14 | C | 1 |
15 | B | 2 |
16 | C | 6 |
17 | C | 2 |
18 | A | 5 |
19 | B | 8 |
In [ ]:
Copied!
df.sort_values('keys')
df.sort_values('keys')
Out[ ]:
| | keys | values |
---|---|---|
13 | A | 2 |
10 | A | 1 |
18 | A | 5 |
0 | B | 4 |
15 | B | 2 |
12 | B | 2 |
11 | B | 4 |
19 | B | 8 |
6 | B | 9 |
5 | B | 4 |
4 | B | 7 |
3 | B | 9 |
7 | B | 3 |
8 | C | 5 |
2 | C | 5 |
14 | C | 1 |
1 | C | 7 |
16 | C | 6 |
17 | C | 2 |
9 | C | 0 |
.groupby()¶
In [ ]:
Copied!
grps = df.groupby('keys')
grps = df.groupby('keys')
In [ ]:
Copied!
grps.get_group('A')
grps.get_group('A')
Out[ ]:
| | keys | values |
---|---|---|
10 | A | 1 |
13 | A | 2 |
18 | A | 5 |
In [ ]:
Copied!
grps.agg(['count', sum])
grps.agg(['count', sum])
Out[ ]:
keys | values (count) | values (sum) |
---|---|---|
A | 3 | 8 |
B | 10 | 52 |
C | 7 | 26 |
In [ ]:
Copied!
tuple(grps)
tuple(grps)
Out[ ]:
(('A',
      keys  values
  10     A       1
  13     A       2
  18     A       5),
 ('B',
      keys  values
  0      B       4
  3      B       9
  4      B       7
  5      B       4
  6      B       9
  7      B       3
  11     B       4
  12     B       2
  15     B       2
  19     B       8),
 ('C',
      keys  values
  1      C       7
  2      C       5
  8      C       5
  9      C       0
  14     C       1
  16     C       6
  17     C       2))
In [ ]:
Copied!
dict(tuple(grps))
dict(tuple(grps))
Out[ ]:
{'A':     keys  values
 10     A       1
 13     A       2
 18     A       5,
 'B':     keys  values
 0      B       4
 3      B       9
 4      B       7
 5      B       4
 6      B       9
 7      B       3
 11     B       4
 12     B       2
 15     B       2
 19     B       8,
 'C':     keys  values
 1      C       7
 2      C       5
 8      C       5
 9      C       0
 14     C       1
 16     C       6
 17     C       2}
In [ ]:
Copied!
df['keys'].unique()
df['keys'].unique()
Out[ ]:
array(['B', 'C', 'A'], dtype=object)