合併DataFrame - 2/2¶
.merge()會根據特定的欄位的內容,將兩個DataFrame合併(merge)在一起。
In [1]:
Copied!
import numpy as np
import pandas as pd
import numpy as np
import pandas as pd
In [46]:
Copied!
df1 = pd.DataFrame({'key':['A','B','C'], 'data1':[1, 2, 3]})
df2 = pd.DataFrame({'key':['C','B','A'], 'data2':[4, 5, 6]})
df1 = pd.DataFrame({'key':['A','B','C'], 'data1':[1, 2, 3]})
df2 = pd.DataFrame({'key':['C','B','A'], 'data2':[4, 5, 6]})
In [65]:
Copied!
np.random.seed(456)
key1 = np.random.choice(['A','B','C'], 5)
data1 = np.arange(5)
key2 = np.random.choice(['A','B','C'], 3)
data2 = np.arange(3)
key3 = np.random.choice(['A','B','C'], 5)
data3 = np.arange(5)
key4 = np.random.choice(['A','B','C','D'], 5)
data4 = np.arange(5)
np.random.seed(456)
key1 = np.random.choice(['A','B','C'], 5)
data1 = np.arange(5)
key2 = np.random.choice(['A','B','C'], 3)
data2 = np.arange(3)
key3 = np.random.choice(['A','B','C'], 5)
data3 = np.arange(5)
key4 = np.random.choice(['A','B','C','D'], 5)
data4 = np.arange(5)
In [56]:
Copied!
df3 = pd.DataFrame({'key':key1, 'data1':data1})
df4 = pd.DataFrame({'key':key2, 'data2':data2})
df5 = pd.DataFrame({'key':key3, 'data3':data3})
df3 = pd.DataFrame({'key':key1, 'data1':data1})
df4 = pd.DataFrame({'key':key2, 'data2':data2})
df5 = pd.DataFrame({'key':key3, 'data3':data3})
In [66]:
Copied!
df6 = pd.DataFrame({'key1':key1, 'data1':data1})
df7 = pd.DataFrame({'key2':key2, 'data2':data2})
df8 = pd.DataFrame({'key3':key3, 'data3':data3})
df9 = pd.DataFrame({'key4':key4, 'data4':data4})
df6 = pd.DataFrame({'key1':key1, 'data1':data1})
df7 = pd.DataFrame({'key2':key2, 'data2':data2})
df8 = pd.DataFrame({'key3':key3, 'data3':data3})
df9 = pd.DataFrame({'key4':key4, 'data4':data4})
一對一¶
In [49]:
Copied!
df1
df1
Out[49]:
key | data1 | |
---|---|---|
0 | A | 1 |
1 | B | 2 |
2 | C | 3 |
In [50]:
Copied!
df2
df2
Out[50]:
key | data2 | |
---|---|---|
0 | C | 4 |
1 | B | 5 |
2 | A | 6 |
In [51]:
Copied!
pd.merge(df1, df2)
pd.merge(df1, df2)
Out[51]:
key | data1 | data2 | |
---|---|---|---|
0 | A | 1 | 6 |
1 | B | 2 | 5 |
2 | C | 3 | 4 |
In [68]:
Copied!
# 明確告知使用哪個欄位merge
pd.merge(df1, df2, on='key')
# 明確告知使用哪個欄位merge
pd.merge(df1, df2, on='key')
Out[68]:
key | data1 | data2 | |
---|---|---|---|
0 | A | 1 | 6 |
1 | B | 2 | 5 |
2 | C | 3 | 4 |
一對多¶
In [52]:
Copied!
df3
df3
Out[52]:
key | data1 | |
---|---|---|
0 | B | 0 |
1 | B | 1 |
2 | C | 2 |
3 | A | 3 |
4 | C | 4 |
In [53]:
Copied!
df4
df4
Out[53]:
key | data2 | |
---|---|---|
0 | C | 0 |
1 | A | 1 |
2 | B | 2 |
In [54]:
Copied!
pd.merge(df3, df4)
pd.merge(df3, df4)
Out[54]:
key | data1 | data2 | |
---|---|---|---|
0 | B | 0 | 2 |
1 | B | 1 | 2 |
2 | C | 2 | 0 |
3 | C | 4 | 0 |
4 | A | 3 | 1 |
多對多¶
In [59]:
Copied!
df5
df5
Out[59]:
key | data3 | |
---|---|---|
0 | A | 0 |
1 | B | 1 |
2 | B | 2 |
3 | A | 3 |
4 | B | 4 |
In [58]:
Copied!
pd.merge(df3, df5)
pd.merge(df3, df5)
Out[58]:
key | data1 | data3 | |
---|---|---|---|
0 | B | 0 | 1 |
1 | B | 0 | 2 |
2 | B | 0 | 4 |
3 | B | 1 | 1 |
4 | B | 1 | 2 |
5 | B | 1 | 4 |
6 | A | 3 | 0 |
7 | A | 3 | 3 |
使用指定的欄位¶
In [60]:
Copied!
df6
df6
Out[60]:
key1 | data1 | |
---|---|---|
0 | B | 0 |
1 | B | 1 |
2 | C | 2 |
3 | A | 3 |
4 | C | 4 |
In [61]:
Copied!
df7
df7
Out[61]:
key2 | data2 | |
---|---|---|
0 | C | 0 |
1 | A | 1 |
2 | B | 2 |
In [62]:
Copied!
df8
df8
Out[62]:
key3 | data3 | |
---|---|---|
0 | A | 0 |
1 | B | 1 |
2 | B | 2 |
3 | A | 3 |
4 | B | 4 |
In [64]:
Copied!
pd.merge(df6, df7, left_on='key1', right_on='key2')
pd.merge(df6, df7, left_on='key1', right_on='key2')
Out[64]:
key1 | data1 | key2 | data2 | |
---|---|---|---|---|
0 | B | 0 | B | 2 |
1 | B | 1 | B | 2 |
2 | C | 2 | C | 0 |
3 | C | 4 | C | 0 |
4 | A | 3 | A | 1 |
合併的方式¶
In [69]:
Copied!
df8
df8
Out[69]:
key3 | data3 | |
---|---|---|
0 | A | 0 |
1 | B | 1 |
2 | B | 2 |
3 | A | 3 |
4 | B | 4 |
In [70]:
Copied!
df9
df9
Out[70]:
key4 | data4 | |
---|---|---|
0 | D | 0 |
1 | A | 1 |
2 | D | 2 |
3 | B | 3 |
4 | B | 4 |
In [71]:
Copied!
pd.merge(df8, df9, left_on='key3', right_on='key4')
pd.merge(df8, df9, left_on='key3', right_on='key4')
Out[71]:
key3 | data3 | key4 | data4 | |
---|---|---|---|---|
0 | A | 0 | A | 1 |
1 | A | 3 | A | 1 |
2 | B | 1 | B | 3 |
3 | B | 1 | B | 4 |
4 | B | 2 | B | 3 |
5 | B | 2 | B | 4 |
6 | B | 4 | B | 3 |
7 | B | 4 | B | 4 |
In [72]:
Copied!
pd.merge(df8, df9, left_on='key3', right_on='key4', how='inner')
pd.merge(df8, df9, left_on='key3', right_on='key4', how='inner')
Out[72]:
key3 | data3 | key4 | data4 | |
---|---|---|---|---|
0 | A | 0 | A | 1 |
1 | A | 3 | A | 1 |
2 | B | 1 | B | 3 |
3 | B | 1 | B | 4 |
4 | B | 2 | B | 3 |
5 | B | 2 | B | 4 |
6 | B | 4 | B | 3 |
7 | B | 4 | B | 4 |
In [73]:
Copied!
pd.merge(df8, df9, left_on='key3', right_on='key4', how='outer')
pd.merge(df8, df9, left_on='key3', right_on='key4', how='outer')
Out[73]:
key3 | data3 | key4 | data4 | |
---|---|---|---|---|
0 | A | 0.0 | A | 1 |
1 | A | 3.0 | A | 1 |
2 | B | 1.0 | B | 3 |
3 | B | 1.0 | B | 4 |
4 | B | 2.0 | B | 3 |
5 | B | 2.0 | B | 4 |
6 | B | 4.0 | B | 3 |
7 | B | 4.0 | B | 4 |
8 | NaN | NaN | D | 0 |
9 | NaN | NaN | D | 2 |
In [74]:
Copied!
pd.merge(df8, df9, left_on='key3', right_on='key4', how='left')
pd.merge(df8, df9, left_on='key3', right_on='key4', how='left')
Out[74]:
key3 | data3 | key4 | data4 | |
---|---|---|---|---|
0 | A | 0 | A | 1 |
1 | B | 1 | B | 3 |
2 | B | 1 | B | 4 |
3 | B | 2 | B | 3 |
4 | B | 2 | B | 4 |
5 | A | 3 | A | 1 |
6 | B | 4 | B | 3 |
7 | B | 4 | B | 4 |
In [75]:
Copied!
pd.merge(df8, df9, left_on='key3', right_on='key4', how='right')
pd.merge(df8, df9, left_on='key3', right_on='key4', how='right')
Out[75]:
key3 | data3 | key4 | data4 | |
---|---|---|---|---|
0 | NaN | NaN | D | 0 |
1 | A | 0.0 | A | 1 |
2 | A | 3.0 | A | 1 |
3 | NaN | NaN | D | 2 |
4 | B | 1.0 | B | 3 |
5 | B | 2.0 | B | 3 |
6 | B | 4.0 | B | 3 |
7 | B | 1.0 | B | 4 |
8 | B | 2.0 | B | 4 |
9 | B | 4.0 | B | 4 |
使用index¶
In [77]:
Copied!
df10 = df6.set_index('key1')
df10 = df6.set_index('key1')
In [78]:
Copied!
df10
df10
Out[78]:
data1 | |
---|---|
key1 | |
B | 0 |
B | 1 |
C | 2 |
A | 3 |
C | 4 |
In [79]:
Copied!
df1
df1
Out[79]:
key | data1 | |
---|---|---|
0 | A | 1 |
1 | B | 2 |
2 | C | 3 |
In [80]:
Copied!
pd.merge(df10, df1, left_index=True, right_on='key')
pd.merge(df10, df1, left_index=True, right_on='key')
Out[80]:
data1_x | key | data1_y | |
---|---|---|---|
1 | 0 | B | 2 |
1 | 1 | B | 2 |
2 | 2 | C | 3 |
2 | 4 | C | 3 |
0 | 3 | A | 1 |
In [81]:
Copied!
df11 = df7.set_index('key2')
df11 = df7.set_index('key2')
In [82]:
Copied!
df11
df11
Out[82]:
data2 | |
---|---|
key2 | |
C | 0 |
A | 1 |
B | 2 |
In [83]:
Copied!
pd.merge(df10, df11, left_index=True, right_index=True)
pd.merge(df10, df11, left_index=True, right_index=True)
Out[83]:
data1 | data2 | |
---|---|---|
A | 3 | 1 |
B | 0 | 2 |
B | 1 | 2 |
C | 2 | 0 |
C | 4 | 0 |