Study notes on Python for Data Analysis: Chapter 7
Data wrangling: cleaning, transforming, merging, and reshaping
Combining and merging data sets
pandas.merge connects the rows of different DataFrames based on one or more keys.
pandas.concat stacks multiple objects together along an axis.
The combine_first instance method splices overlapping data together, using values from one object to fill in missing values in another.
Database-style DataFrame merges
import numpy as np
import pandas as pd
from pandas import DataFrame,Series
df1 = DataFrame({'key':['b','b','a','c','a','a','b'],'data1':range(7)})
df2 = DataFrame({'key':['a','b','d'],'data2':range(3)})
df1
|   | key | data1 |
|---|---|---|
| 0 | b | 0 |
| 1 | b | 1 |
| 2 | a | 2 |
| 3 | c | 3 |
| 4 | a | 4 |
| 5 | a | 5 |
| 6 | b | 6 |
df2
|   | key | data2 |
|---|---|---|
| 0 | a | 0 |
| 1 | b | 1 |
| 2 | d | 2 |
pd.merge(df1,df2)  #with no key specified, the overlapping column names are used as the keys
|   | key | data1 | data2 |
|---|---|---|---|
| 0 | b | 0 | 1 |
| 1 | b | 1 | 1 |
| 2 | b | 6 | 1 |
| 3 | a | 2 | 0 |
| 4 | a | 4 | 0 |
| 5 | a | 5 | 0 |
pd.merge(df1,df2,on='key')
|   | key | data1 | data2 |
|---|---|---|---|
| 0 | b | 0 | 1 |
| 1 | b | 1 | 1 |
| 2 | b | 6 | 1 |
| 3 | a | 2 | 0 |
| 4 | a | 4 | 0 |
| 5 | a | 5 | 0 |
#if the key column names differ between the two objects, specify them separately
df3 = DataFrame({'lkey':['b','b','a','c','a','a','b'],'data1':range(7)})
df4 = DataFrame({'rkey':['a','b','d'],'data2':range(3)})
#merge does an inner join by default, so the result is the intersection of the keys. The other options are left, right, and outer. An outer join takes the union of the keys, combining the effects of a left and a right join.
pd.merge(df3,df4,left_on='lkey',right_on='rkey')
|   | lkey | data1 | rkey | data2 |
|---|---|---|---|---|
| 0 | b | 0 | b | 1 |
| 1 | b | 1 | b | 1 |
| 2 | b | 6 | b | 1 |
| 3 | a | 2 | a | 0 |
| 4 | a | 4 | a | 0 |
| 5 | a | 5 | a | 0 |
pd.merge(df1,df2,how='outer')
|   | key | data1 | data2 |
|---|---|---|---|
| 0 | b | 0.0 | 1.0 |
| 1 | b | 1.0 | 1.0 |
| 2 | b | 6.0 | 1.0 |
| 3 | a | 2.0 | 0.0 |
| 4 | a | 4.0 | 0.0 |
| 5 | a | 5.0 | 0.0 |
| 6 | c | 3.0 | NaN |
| 7 | d | NaN | 2.0 |
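#a quick sketch of the remaining options: a left join keeps every row of the left frame (df1's 'c' row survives with NaN in data2), a right join keeps every row of the right
pd.merge(df1,df2,how='left')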
#many-to-many merges
df1 = DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})
df2 = DataFrame({'key':['a','b','a','b','d'],'data2':range(5)})
df1
|   | key | data1 |
|---|---|---|
| 0 | b | 0 |
| 1 | b | 1 |
| 2 | a | 2 |
| 3 | c | 3 |
| 4 | a | 4 |
| 5 | b | 5 |
df2
|   | key | data2 |
|---|---|---|
| 0 | a | 0 |
| 1 | b | 1 |
| 2 | a | 2 |
| 3 | b | 3 |
| 4 | d | 4 |
pd.merge(df1,df2,on='key',how='left')  #a many-to-many join produces the Cartesian product of the matching rows
|    | key | data1 | data2 |
|---|---|---|---|
| 0 | b | 0 | 1.0 |
| 1 | b | 0 | 3.0 |
| 2 | b | 1 | 1.0 |
| 3 | b | 1 | 3.0 |
| 4 | a | 2 | 0.0 |
| 5 | a | 2 | 2.0 |
| 6 | c | 3 | NaN |
| 7 | a | 4 | 0.0 |
| 8 | a | 4 | 2.0 |
| 9 | b | 5 | 1.0 |
| 10 | b | 5 | 3.0 |
pd.merge(df1,df2,how='inner')
|   | key | data1 | data2 |
|---|---|---|---|
| 0 | b | 0 | 1 |
| 1 | b | 0 | 3 |
| 2 | b | 1 | 1 |
| 3 | b | 1 | 3 |
| 4 | b | 5 | 1 |
| 5 | b | 5 | 3 |
| 6 | a | 2 | 0 |
| 7 | a | 2 | 2 |
| 8 | a | 4 | 0 |
| 9 | a | 4 | 2 |
#merging on multiple keys
left = DataFrame({'key1':['foo','foo','bar'],'key2':['one','two','one'],'lval':[1,2,3]})
right = DataFrame({'key1':['foo','foo','bar','bar'],'key2':['one','one','one','two'],'rval':[4,5,6,7]})
pd.merge(left,right,on=['key1','key2'],how='outer')
|   | key1 | key2 | lval | rval |
|---|---|---|---|---|
| 0 | foo | one | 1.0 | 4.0 |
| 1 | foo | one | 1.0 | 5.0 |
| 2 | foo | two | 2.0 | NaN |
| 3 | bar | one | 3.0 | 6.0 |
| 4 | bar | two | NaN | 7.0 |
#handling overlapping column names
pd.merge(left,right,on='key1')
|   | key1 | key2_x | lval | key2_y | rval |
|---|---|---|---|---|---|
| 0 | foo | one | 1 | one | 4 |
| 1 | foo | one | 1 | one | 5 |
| 2 | foo | two | 2 | one | 4 |
| 3 | foo | two | 2 | one | 5 |
| 4 | bar | one | 3 | one | 6 |
| 5 | bar | one | 3 | two | 7 |
pd.merge(left,right,on='key1',suffixes=('_left','_right'))
|   | key1 | key2_left | lval | key2_right | rval |
|---|---|---|---|---|---|
| 0 | foo | one | 1 | one | 4 |
| 1 | foo | one | 1 | one | 5 |
| 2 | foo | two | 2 | one | 4 |
| 3 | foo | two | 2 | one | 5 |
| 4 | bar | one | 3 | one | 6 |
| 5 | bar | one | 3 | two | 7 |
Merging on index
#using the index as the join key
left1 = DataFrame({'key':['a','b','a','a','b','c'],'value':range(6)})
right1 = DataFrame({'group_val':[3.5,7]},index=['a','b'])
left1
|   | key | value |
|---|---|---|
| 0 | a | 0 |
| 1 | b | 1 |
| 2 | a | 2 |
| 3 | a | 3 |
| 4 | b | 4 |
| 5 | c | 5 |
right1
|   | group_val |
|---|---|
| a | 3.5 |
| b | 7.0 |
pd.merge(left1,right1,left_on='key',right_index=True)
|   | key | value | group_val |
|---|---|---|---|
| 0 | a | 0 | 3.5 |
| 2 | a | 2 | 3.5 |
| 3 | a | 3 | 3.5 |
| 1 | b | 1 | 7.0 |
| 4 | b | 4 | 7.0 |
#an outer join yields the union of the keys
pd.merge(left1,right1,left_on='key',right_index=True,how='outer')
|   | key | value | group_val |
|---|---|---|---|
| 0 | a | 0 | 3.5 |
| 2 | a | 2 | 3.5 |
| 3 | a | 3 | 3.5 |
| 1 | b | 1 | 7.0 |
| 4 | b | 4 | 7.0 |
| 5 | c | 5 | NaN |
#hierarchically indexed data
lefth = DataFrame({'key1':['Ohio','Ohio','Ohio','Nevada','Nevada'],'key2':[2000,2001,2002,2001,2002],'data':np.arange(5.)})
righth = DataFrame(np.arange(12).reshape((6,2)),index=[['Nevada','Nevada','Ohio','Ohio','Ohio','Ohio'],[2001,2000,2000,2000,2001,2002]],columns=['event1','event2'])
lefth
|   | key1 | key2 | data |
|---|---|---|---|
| 0 | Ohio | 2000 | 0.0 |
| 1 | Ohio | 2001 | 1.0 |
| 2 | Ohio | 2002 | 2.0 |
| 3 | Nevada | 2001 | 3.0 |
| 4 | Nevada | 2002 | 4.0 |
righth
|   |   | event1 | event2 |
|---|---|---|---|
| Nevada | 2001 | 0 | 1 |
|   | 2000 | 2 | 3 |
| Ohio | 2000 | 4 | 5 |
|   | 2000 | 6 | 7 |
|   | 2001 | 8 | 9 |
|   | 2002 | 10 | 11 |
#pass a list to indicate the multiple columns to merge on
pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True)
|   | key1 | key2 | data | event1 | event2 |
|---|---|---|---|---|---|
| 0 | Ohio | 2000 | 0.0 | 4 | 5 |
| 0 | Ohio | 2000 | 0.0 | 6 | 7 |
| 1 | Ohio | 2001 | 1.0 | 8 | 9 |
| 2 | Ohio | 2002 | 2.0 | 10 | 11 |
| 3 | Nevada | 2001 | 3.0 | 0 | 1 |
pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True,how='outer')
|   | key1 | key2 | data | event1 | event2 |
|---|---|---|---|---|---|
| 0 | Ohio | 2000 | 0.0 | 4.0 | 5.0 |
| 0 | Ohio | 2000 | 0.0 | 6.0 | 7.0 |
| 1 | Ohio | 2001 | 1.0 | 8.0 | 9.0 |
| 2 | Ohio | 2002 | 2.0 | 10.0 | 11.0 |
| 3 | Nevada | 2001 | 3.0 | 0.0 | 1.0 |
| 4 | Nevada | 2002 | 4.0 | NaN | NaN |
| 4 | Nevada | 2000 | NaN | 2.0 | 3.0 |
#using the indexes of both sides of the merge
left2 = DataFrame([[1.,2.],[3.,4.],[5.,6.]],index=['a','c','e'],columns=['Ohio','Nevada'])
right2 = DataFrame([[7.,8.],[9.,10.],[11.,12.],[13.,14.]],index=['b','c','d','e'],columns=['Missouri','Alabama'])
left2
|   | Ohio | Nevada |
|---|---|---|
| a | 1.0 | 2.0 |
| c | 3.0 | 4.0 |
| e | 5.0 | 6.0 |
right2
|   | Missouri | Alabama |
|---|---|---|
| b | 7.0 | 8.0 |
| c | 9.0 | 10.0 |
| d | 11.0 | 12.0 |
| e | 13.0 | 14.0 |
pd.merge(left2,right2,how='outer',left_index=True,right_index=True)
|   | Ohio | Nevada | Missouri | Alabama |
|---|---|---|---|---|
| a | 1.0 | 2.0 | NaN | NaN |
| b | NaN | NaN | 7.0 | 8.0 |
| c | 3.0 | 4.0 | 9.0 | 10.0 |
| d | NaN | NaN | 11.0 | 12.0 |
| e | 5.0 | 6.0 | 13.0 | 14.0 |
#the join method is a more convenient way to merge by index
left2.join(right2,how='outer')
|   | Ohio | Nevada | Missouri | Alabama |
|---|---|---|---|---|
| a | 1.0 | 2.0 | NaN | NaN |
| b | NaN | NaN | 7.0 | 8.0 |
| c | 3.0 | 4.0 | 9.0 | 10.0 |
| d | NaN | NaN | 11.0 | 12.0 |
| e | 5.0 | 6.0 | 13.0 | 14.0 |
left1.join(right1,on='key')
|   | key | value | group_val |
|---|---|---|---|
| 0 | a | 0 | 3.5 |
| 1 | b | 1 | 7.0 |
| 2 | a | 2 | 3.5 |
| 3 | a | 3 | 3.5 |
| 4 | b | 4 | 7.0 |
| 5 | c | 5 | NaN |
#join also accepts a list of DataFrames
another = DataFrame([[7.,8.],[9.,10.],[11.,12.],[16.,17]],index=['a','c','e','f'],columns=['New York','Oregon'])
left2.join([right2,another])
|   | Ohio | Nevada | Missouri | Alabama | New York | Oregon |
|---|---|---|---|---|---|---|
| a | 1.0 | 2.0 | NaN | NaN | 7.0 | 8.0 |
| c | 3.0 | 4.0 | 9.0 | 10.0 | 9.0 | 10.0 |
| e | 5.0 | 6.0 | 13.0 | 14.0 | 11.0 | 12.0 |
left2.join([right2,another],how='outer')
|   | Ohio | Nevada | Missouri | Alabama | New York | Oregon |
|---|---|---|---|---|---|---|
| a | 1.0 | 2.0 | NaN | NaN | 7.0 | 8.0 |
| c | 3.0 | 4.0 | 9.0 | 10.0 | 9.0 | 10.0 |
| e | 5.0 | 6.0 | 13.0 | 14.0 | 11.0 | 12.0 |
| b | NaN | NaN | 7.0 | 8.0 | NaN | NaN |
| d | NaN | NaN | 11.0 | 12.0 | NaN | NaN |
| f | NaN | NaN | NaN | NaN | 16.0 | 17.0 |
Concatenating along an axis
#NumPy's concatenate function
arr = np.arange(12).reshape((3,4))
arr
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
np.concatenate([arr,arr],axis=1)
array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])
#pandas' concat function works along axis=0 by default
s1 = Series([0,1],index=['a','b'])
s2 = Series([2,3,4],index=['c','d','e'])
s3 = Series([5,6],index=['f','g'])
pd.concat([s1,s2,s3])
a 0
b 1
c 2
d 3
e 4
f 5
g 6
dtype: int64
#pass axis=1 (the columns) and the result becomes a DataFrame
pd.concat([s1,s2,s3],axis=1)
|   | 0 | 1 | 2 |
|---|---|---|---|
| a | 0.0 | NaN | NaN |
| b | 1.0 | NaN | NaN |
| c | NaN | 2.0 | NaN |
| d | NaN | 3.0 | NaN |
| e | NaN | 4.0 | NaN |
| f | NaN | NaN | 5.0 |
| g | NaN | NaN | 6.0 |
s4 = pd.concat([s1*5,s3])
s4
a 0
b 5
f 5
g 6
dtype: int64
pd.concat([s1,s4],axis=1)
|   | 0 | 1 |
|---|---|---|
| a | 0.0 | 0 |
| b | 1.0 | 5 |
| f | NaN | 5 |
| g | NaN | 6 |
pd.concat([s1,s4],axis=1,join='inner')  #join='inner' intersects the indexes, so f and g disappear
|   | 0 | 1 |
|---|---|---|
| a | 0 | 0 |
| b | 1 | 5 |
pd.concat([s1,s4],axis=1,join_axes=[['a','c','b','e']])
---------------------------------------------------------------------------
TypeError: concat() got an unexpected keyword argument 'join_axes'
#the join_axes keyword was deprecated in pandas 0.25 and removed in pandas 1.0, so this call now fails.
#The same error also surfaces through old versions of pandas_profiling (ProfileReport), which passed
#join_axes to concat internally; there the fix is upgrading pandas_profiling, not pandas itself.
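#reindexing the result is the documented replacement for join_axes; a minimal equivalent of the call above:
pd.concat([s1,s4],axis=1).reindex(['a','c','b','e'])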
#create a hierarchical index on the concatenation axis
result = pd.concat([s1,s1,s3],keys=['one','two','three'])
result
one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64
result.unstack()
|   | a | b | f | g |
|---|---|---|---|---|
| one | 0.0 | 1.0 | NaN | NaN |
| two | 0.0 | 1.0 | NaN | NaN |
| three | NaN | NaN | 5.0 | 6.0 |
#when combining Series along axis=1, the keys become the DataFrame's column headers
pd.concat([s1,s2,s3],axis=1,keys=['one','two','three'])
|   | one | two | three |
|---|---|---|---|
| a | 0.0 | NaN | NaN |
| b | 1.0 | NaN | NaN |
| c | NaN | 2.0 | NaN |
| d | NaN | 3.0 | NaN |
| e | NaN | 4.0 | NaN |
| f | NaN | NaN | 5.0 |
| g | NaN | NaN | 6.0 |
#the same logic applies to DataFrames
df1 = DataFrame(np.arange(6).reshape(3,2),index=['a','b','c'],columns=['one','two'])
df2 = DataFrame(5+np.arange(4).reshape(2,2),index=['a','c'],columns=['three','four'])
pd.concat([df1,df2],axis=1,keys=['level1','level2'])
|   | level1 | level1 | level2 | level2 |
|---|---|---|---|---|
|   | one | two | three | four |
| a | 0 | 1 | 5.0 | 6.0 |
| b | 2 | 3 | NaN | NaN |
| c | 4 | 5 | 7.0 | 8.0 |
#if you pass a dict of objects instead of a list, the dict's keys are used for the keys option
pd.concat({'level1':df1,'level2':df2},axis=1)
|   | level1 | level1 | level2 | level2 |
|---|---|---|---|---|
|   | one | two | three | four |
| a | 0 | 1 | 5.0 | 6.0 |
| b | 2 | 3 | NaN | NaN |
| c | 4 | 5 | 7.0 | 8.0 |
pd.concat([df1,df2],axis=1,keys=['level1','level2'],names=['upper','lower'])
| upper | level1 | level1 | level2 | level2 |
|---|---|---|---|---|
| lower | one | two | three | four |
| a | 0 | 1 | 5.0 | 6.0 |
| b | 2 | 3 | NaN | NaN |
| c | 4 | 5 | 7.0 | 8.0 |
df1 = DataFrame(np.random.randn(3,4),columns=['a','b','c','d'])
df2 = DataFrame(np.random.randn(2,3),columns=['b','d','a'])
df1
|   | a | b | c | d |
|---|---|---|---|---|
| 0 | 0.909729 | -1.166310 | -0.988668 | -0.398099 |
| 1 | 1.226715 | 1.429447 | -0.353615 | 1.096639 |
| 2 | 0.420715 | 1.319891 | -2.096798 | -0.125654 |
df2
|   | b | d | a |
|---|---|---|---|
| 0 | -0.077881 | 0.031641 | -0.839706 |
| 1 | -0.231105 | 1.227890 | 0.422392 |
pd.concat([df1,df2],ignore_index=True)
|   | a | b | c | d |
|---|---|---|---|---|
| 0 | 0.909729 | -1.166310 | -0.988668 | -0.398099 |
| 1 | 1.226715 | 1.429447 | -0.353615 | 1.096639 |
| 2 | 0.420715 | 1.319891 | -2.096798 | -0.125654 |
| 3 | -0.839706 | -0.077881 | NaN | 0.031641 |
| 4 | 0.422392 | -0.231105 | NaN | 1.227890 |
Combining data with overlap
#NumPy's where function expresses a vectorized if-else
a = Series([np.nan,2.5,np.nan,3.5,4.5,np.nan],index=['f','e','d','c','b','a'])
b = Series(np.arange(len(a),dtype=np.float64),index=['f','e','d','c','b','a'])
b.iloc[-1] = np.nan  #set the last value to NaN (positional indexing, since the index labels are strings)
a
f NaN
e 2.5
d NaN
c 3.5
b 4.5
a NaN
dtype: float64
b
f 0.0
e 1.0
d 2.0
c 3.0
b 4.0
a NaN
dtype: float64
np.where(pd.isnull(a),b,a)
array([0. , 2.5, 2. , 3.5, 4.5, nan])
#another way: combine_first aligns the data by index
b[:-2].combine_first(a[2:])
a NaN
b 4.5
c 3.0
d 2.0
e 1.0
f 0.0
dtype: float64
Patching missing data in the calling object with data from the object passed in
df1 = DataFrame({'a':[1.,np.nan,5.,np.nan],'b':[np.nan,2.,np.nan,6.],'c':range(2,18,4)})
df2 = DataFrame({'a':[5.,4.,np.nan,3.,7.],'b':[np.nan,3.,4.,6.,8.]})
df1.combine_first(df2)
|   | a | b | c |
|---|---|---|---|
| 0 | 1.0 | NaN | 2.0 |
| 1 | 4.0 | 2.0 | 6.0 |
| 2 | 5.0 | 4.0 | 10.0 |
| 3 | 3.0 | 6.0 | 14.0 |
| 4 | 7.0 | 8.0 | NaN |
df1
|   | a | b | c |
|---|---|---|---|
| 0 | 1.0 | NaN | 2 |
| 1 | NaN | 2.0 | 6 |
| 2 | 5.0 | NaN | 10 |
| 3 | NaN | 6.0 | 14 |
df2
|   | a | b |
|---|---|---|
| 0 | 5.0 | NaN |
| 1 | 4.0 | 3.0 |
| 2 | NaN | 4.0 |
| 3 | 3.0 | 6.0 |
| 4 | 7.0 | 8.0 |
Reshaping and pivoting
Reshaping with hierarchical indexing
stack: rotates the columns of the data into rows
unstack: rotates the rows into columns
data = DataFrame(np.arange(6).reshape((2,3)),index=pd.Index(['Ohio','Colorado'],name='state'),columns=pd.Index(['one','two','three'],name='number'))
data
| number | one | two | three |
|---|---|---|---|
| state |   |   |   |
| Ohio | 0 | 1 | 2 |
| Colorado | 3 | 4 | 5 |
result = data.stack()
result
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32
result.unstack()
| number | one | two | three |
|---|---|---|---|
| state |   |   |   |
| Ohio | 0 | 1 | 2 |
| Colorado | 3 | 4 | 5 |
#pass a level number or name to unstack a different level
result.unstack(0)
| state | Ohio | Colorado |
|---|---|---|
| number |   |   |
| one | 0 | 3 |
| two | 1 | 4 |
| three | 2 | 5 |
result.unstack('state')
| state | Ohio | Colorado |
|---|---|---|
| number |   |   |
| one | 0 | 3 |
| two | 1 | 4 |
| three | 2 | 5 |
s1 = Series([0,1,2,3],index=['a','b','c','d'])
s2 = Series([4,5,6],index=['c','d','e'])
data2 = pd.concat([s1,s2],keys=['one','two'])
data2.unstack()
|   | a | b | c | d | e |
|---|---|---|---|---|---|
| one | 0.0 | 1.0 | 2.0 | 3.0 | NaN |
| two | NaN | NaN | 4.0 | 5.0 | 6.0 |
data2.unstack().stack()
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64
data2.unstack().stack(dropna=False)
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64
#when you unstack a DataFrame, the level unstacked becomes the lowest level in the result
df = DataFrame({'left':result,'right':result+5},columns=pd.Index(['left','right'],name='side'))
df
| side |   | left | right |
|---|---|---|---|
| state | number |   |   |
| Ohio | one | 0 | 5 |
|   | two | 1 | 6 |
|   | three | 2 | 7 |
| Colorado | one | 3 | 8 |
|   | two | 4 | 9 |
|   | three | 5 | 10 |
df.unstack('state')
| side | left | left | right | right |
|---|---|---|---|---|
| state | Ohio | Colorado | Ohio | Colorado |
| number |   |   |   |   |
| one | 0 | 3 | 5 | 8 |
| two | 1 | 4 | 6 | 9 |
| three | 2 | 5 | 7 | 10 |
df.unstack('state').stack('side')
| state |   | Colorado | Ohio |
|---|---|---|---|
| number | side |   |   |
| one | left | 3 | 0 |
|   | right | 8 | 5 |
| two | left | 4 | 1 |
|   | right | 9 | 6 |
| three | left | 5 | 2 |
|   | right | 10 | 7 |
Pivoting "long" to "wide" format
# ldata here is generated by hand and differs from the book's data; it only exercises the method
ldata = DataFrame({'date':['1959-03-31','1959-03-31','1959-03-31','1959-06-30','1959-06-30','1959-06-30','1959-09-30', '1959-09-30','1959-09-30'],'item':['realgdp', 'infl', 'unemp']*3,'value':[2710.349, 0.0, 5.8]*3})
print(ldata)
date item value
0 1959-03-31 realgdp 2710.349
1 1959-03-31 infl 0.000
2 1959-03-31 unemp 5.800
3 1959-06-30 realgdp 2710.349
4 1959-06-30 infl 0.000
5 1959-06-30 unemp 5.800
6 1959-09-30 realgdp 2710.349
7 1959-09-30 infl 0.000
8 1959-09-30 unemp 5.800
pivoted = ldata.pivot('date','item','value')  #the first two arguments are the columns to use as the row and column index; the last is the data column that fills the DataFrame
pivoted.head()
| item | infl | realgdp | unemp |
|---|---|---|---|
| date |   |   |   |
| 1959-03-31 | 0.0 | 2710.349 | 5.8 |
| 1959-06-30 | 0.0 | 2710.349 | 5.8 |
| 1959-09-30 | 0.0 | 2710.349 | 5.8 |
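#note: recent pandas versions accept only keyword arguments for pivot, so the call above would be written as
pivoted = ldata.pivot(index='date',columns='item',values='value')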
ldata['value2'] = np.random.randn(len(ldata))
ldata
|   | date | item | value | value2 |
|---|---|---|---|---|
| 0 | 1959-03-31 | realgdp | 2710.349 | 0.576713 |
| 1 | 1959-03-31 | infl | 0.000 | 0.164654 |
| 2 | 1959-03-31 | unemp | 5.800 | -0.309467 |
| 3 | 1959-06-30 | realgdp | 2710.349 | 1.321802 |
| 4 | 1959-06-30 | infl | 0.000 | 0.964655 |
| 5 | 1959-06-30 | unemp | 5.800 | -0.355550 |
| 6 | 1959-09-30 | realgdp | 2710.349 | -0.303973 |
| 7 | 1959-09-30 | infl | 0.000 | 0.791143 |
| 8 | 1959-09-30 | unemp | 5.800 | 0.985764 |
pivoted = ldata.pivot('date','item')
pivoted
|   | value | value | value | value2 | value2 | value2 |
|---|---|---|---|---|---|---|
| item | infl | realgdp | unemp | infl | realgdp | unemp |
| date |   |   |   |   |   |   |
| 1959-03-31 | 0.0 | 2710.349 | 5.8 | 0.164654 | 0.576713 | -0.309467 |
| 1959-06-30 | 0.0 | 2710.349 | 5.8 | 0.964655 | 1.321802 | -0.355550 |
| 1959-09-30 | 0.0 | 2710.349 | 5.8 | 0.791143 | -0.303973 | 0.985764 |
pivoted['value']
| item | infl | realgdp | unemp |
|---|---|---|---|
| date |   |   |   |
| 1959-03-31 | 0.0 | 2710.349 | 5.8 |
| 1959-06-30 | 0.0 | 2710.349 | 5.8 |
| 1959-09-30 | 0.0 | 2710.349 | 5.8 |
#set_index builds the hierarchical index, then unstack reshapes it
unstacked = ldata.set_index(['date','item']).unstack('item')
unstacked
|   | value | value | value | value2 | value2 | value2 |
|---|---|---|---|---|---|---|
| item | infl | realgdp | unemp | infl | realgdp | unemp |
| date |   |   |   |   |   |   |
| 1959-03-31 | 0.0 | 2710.349 | 5.8 | 0.164654 | 0.576713 | -0.309467 |
| 1959-06-30 | 0.0 | 2710.349 | 5.8 | 0.964655 | 1.321802 | -0.355550 |
| 1959-09-30 | 0.0 | 2710.349 | 5.8 | 0.791143 | -0.303973 | 0.985764 |
Data transformation
Removing duplicates
data = DataFrame({'k1':['one']*3+['two']*4,'k2':[1,1,2,3,3,4,4]})
data
|   | k1 | k2 |
|---|---|---|
| 0 | one | 1 |
| 1 | one | 1 |
| 2 | one | 2 |
| 3 | two | 3 |
| 4 | two | 3 |
| 5 | two | 4 |
| 6 | two | 4 |
#the DataFrame duplicated method returns a boolean Series indicating whether each row is a duplicate
data.duplicated()
0 False
1 True
2 False
3 False
4 True
5 False
6 True
dtype: bool
data.drop_duplicates()  #returns a DataFrame with the duplicate rows removed
data['v1'] = range(7)
data.drop_duplicates(['k1'])
#duplicated and drop_duplicates keep the first observed value combination by default
data.drop_duplicates(['k1','k2'],keep='last')  #keep='last' keeps the last one instead (the book's take_last=True argument has been replaced by keep='last')
|   | k1 | k2 | v1 |
|---|---|---|---|
| 1 | one | 1 | 1 |
| 2 | one | 2 | 2 |
| 4 | two | 3 | 4 |
| 6 | two | 4 | 6 |
Transforming data with a function or mapping
data = DataFrame({'food':['bacon','pulled pork','bacon','Pastrami','corned beef','Bacon','pastrami','honey ham','nova lox'],'ounces':[4,3,12,6,7.5,8,3,5,6]})
data
|   | food | ounces |
|---|---|---|
| 0 | bacon | 4.0 |
| 1 | pulled pork | 3.0 |
| 2 | bacon | 12.0 |
| 3 | Pastrami | 6.0 |
| 4 | corned beef | 7.5 |
| 5 | Bacon | 8.0 |
| 6 | pastrami | 3.0 |
| 7 | honey ham | 5.0 |
| 8 | nova lox | 6.0 |
meat_to_animal = {'bacon':'pig','pulled pork':'pig','pastrami':'cow','corned beef':'cow','honey ham':'pig','nova lox':'salmon'
}
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data
|   | food | ounces | animal |
|---|---|---|---|
| 0 | bacon | 4.0 | pig |
| 1 | pulled pork | 3.0 | pig |
| 2 | bacon | 12.0 | pig |
| 3 | Pastrami | 6.0 | cow |
| 4 | corned beef | 7.5 | cow |
| 5 | Bacon | 8.0 | pig |
| 6 | pastrami | 3.0 | cow |
| 7 | honey ham | 5.0 | pig |
| 8 | nova lox | 6.0 | salmon |
data['food'].map(lambda x: meat_to_animal[x.lower()])
#map is a convenient way to perform element-wise transformations and other data cleaning operations
0 pig
1 pig
2 pig
3 cow
4 cow
5 pig
6 cow
7 pig
8 salmon
Name: food, dtype: object
Replacing values
data = Series([1.,-999.,2.,-999.,-1000.,3.])
data
0 1.0
1 -999.0
2 2.0
3 -999.0
4 -1000.0
5 3.0
dtype: float64
data.replace(-999,np.nan)
0 1.0
1 NaN
2 2.0
3 NaN
4 -1000.0
5 3.0
dtype: float64
#to replace several values at once, pass a list of the values to replace along with the substitute value
data.replace([-999,-1000],np.nan)
0 1.0
1 NaN
2 2.0
3 NaN
4 NaN
5 3.0
dtype: float64
#to use a different replacement for each value, pass a list of substitutes
data.replace([-999,-1000],[np.nan,0])
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
#the argument can also be a dict
data.replace({-999:np.nan,-1000:0})
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
Renaming axis indexes
data = DataFrame(np.arange(12).reshape((3,4)),index=['Ohio','Colorado','New York'],columns=['one','two','three','four'])
data.index.map(str.upper)
Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')
data.index = data.index.map(str.upper)
data
|   | one | two | three | four |
|---|---|---|---|---|
| OHIO | 0 | 1 | 2 | 3 |
| COLORADO | 4 | 5 | 6 | 7 |
| NEW YORK | 8 | 9 | 10 | 11 |
#to create a transformed version of the data set without modifying the original, use rename
data.rename(index=str.title,columns=str.upper)
|   | ONE | TWO | THREE | FOUR |
|---|---|---|---|---|
| Ohio | 0 | 1 | 2 | 3 |
| Colorado | 4 | 5 | 6 | 7 |
| New York | 8 | 9 | 10 | 11 |
data.rename(index={'OHIO':'INDIANA'},columns={'three':'peekaboo'})
|   | one | two | peekaboo | four |
|---|---|---|---|---|
| INDIANA | 0 | 1 | 2 | 3 |
| COLORADO | 4 | 5 | 6 | 7 |
| NEW YORK | 8 | 9 | 10 | 11 |
_ = data.rename(index={'OHIO':'INDIANA'},inplace=True)
data
|   | one | two | three | four |
|---|---|---|---|---|
| INDIANA | 0 | 1 | 2 | 3 |
| COLORADO | 4 | 5 | 6 | 7 |
| NEW YORK | 8 | 9 | 10 | 11 |
Discretization and binning
ages = [20,22,25,27,21,23,37,31,61,45,41,32]
bins = [18,25,35,60,100]
cats = pd.cut(ages,bins)
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
#the labels attribute is deprecated; cats.labels is now Categorical.codes
cats.codes
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
#the levels attribute is deprecated; it is now Categorical.categories
cats.categories
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')
pd.value_counts(cats)
(18, 25] 5
(25, 35] 3
(35, 60] 3
(60, 100] 1
dtype: int64
#which side is closed can be changed with right=False
pd.cut(ages,[18,26,36,61,100],right=False)
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64, left]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
#you can set your own bin names
group_names = ['Youth','YoungAdult','MiddleAged','Senior']
pd.cut(ages,bins,labels=group_names)
['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']
#if you pass cut an integer number of bins instead of explicit edges, it computes equal-length bins from the data's min and max
data = np.random.rand(20)  #uniformly distributed data
pd.cut(data,4,precision=2)
[(0.031, 0.27], (0.75, 1.0], (0.27, 0.51], (0.031, 0.27], (0.031, 0.27], ..., (0.27, 0.51], (0.031, 0.27], (0.75, 1.0], (0.75, 1.0], (0.031, 0.27]]
Length: 20
Categories (4, interval[float64, right]): [(0.031, 0.27] < (0.27, 0.51] < (0.51, 0.75] < (0.75, 1.0]]
#qcut bins the data based on sample quantiles
data = np.random.randn(1000)  #normally distributed
cats = pd.qcut(data,4)  #cut into quartiles
cats
[(0.699, 2.801], (0.699, 2.801], (0.699, 2.801], (0.0346, 0.699], (-0.703, 0.0346], ..., (-2.582, -0.703], (-0.703, 0.0346], (-2.582, -0.703], (0.699, 2.801], (0.699, 2.801]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.582, -0.703] < (-0.703, 0.0346] < (0.0346, 0.699] < (0.699, 2.801]]
pd.value_counts(cats)
(-2.582, -0.703] 250
(-0.703, 0.0346] 250
(0.0346, 0.699] 250
(0.699, 2.801] 250
dtype: int64
#pass your own quantiles
pd.qcut(data,[0,0.1,0.5,0.9,1.])
[(1.263, 2.801], (0.0346, 1.263], (1.263, 2.801], (0.0346, 1.263], (-1.274, 0.0346], ..., (-2.582, -1.274], (-1.274, 0.0346], (-1.274, 0.0346], (0.0346, 1.263], (0.0346, 1.263]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.582, -1.274] < (-1.274, 0.0346] < (0.0346, 1.263] < (1.263, 2.801]]
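#a related option: labels=False makes qcut (and cut) return the bin codes instead of intervals
pd.qcut(data,4,labels=False)  #an array of quartile numbers 0 through 3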
Detecting and filtering outliers
np.random.seed(12345)
data = DataFrame(np.random.randn(1000,4))
data.describe()
|   | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
| mean | -0.067684 | 0.067924 | 0.025598 | -0.002298 |
| std | 0.998035 | 0.992106 | 1.006835 | 0.996794 |
| min | -3.428254 | -3.548824 | -3.184377 | -3.745356 |
| 25% | -0.774890 | -0.591841 | -0.641675 | -0.644144 |
| 50% | -0.116401 | 0.101143 | 0.002073 | -0.013611 |
| 75% | 0.616366 | 0.780282 | 0.680391 | 0.654328 |
| max | 3.366626 | 2.653656 | 3.260383 | 3.927528 |
col = data[3]
col[np.abs(col)>3]
97 3.927528
305 -3.399312
400 -3.745356
Name: 3, dtype: float64
#select all rows containing a value exceeding 3 or -3
data[(np.abs(data)>3).any(axis=1)]
|   | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| 5 | -0.539741 | 0.476985 | 3.248944 | -1.021228 |
| 97 | -0.774363 | 0.552936 | 0.106061 | 3.927528 |
| 102 | -0.655054 | -0.565230 | 3.176873 | 0.959533 |
| 305 | -2.315555 | 0.457246 | -0.025907 | -3.399312 |
| 324 | 0.050188 | 1.951312 | 3.260383 | 0.963301 |
| 400 | 0.146326 | 0.508391 | -0.196713 | -3.745356 |
| 499 | -0.293333 | -0.242459 | -3.056990 | 1.918403 |
| 523 | -3.428254 | -0.296336 | -0.439938 | -0.867165 |
| 586 | 0.275144 | 1.179227 | -3.184377 | 1.369891 |
| 808 | -0.362528 | -3.548824 | 1.553205 | -2.186301 |
| 900 | 3.366626 | -2.372214 | 0.851010 | 1.332846 |
#cap values outside the interval -3 to 3
data[np.abs(data)>3] = np.sign(data)*3
data.describe()
|   | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
| mean | -0.067623 | 0.068473 | 0.025153 | -0.002081 |
| std | 0.995485 | 0.990253 | 1.003977 | 0.989736 |
| min | -3.000000 | -3.000000 | -3.000000 | -3.000000 |
| 25% | -0.774890 | -0.591841 | -0.641675 | -0.644144 |
| 50% | -0.116401 | 0.101143 | 0.002073 | -0.013611 |
| 75% | 0.616366 | 0.780282 | 0.680391 | 0.654328 |
| max | 3.000000 | 2.653656 | 3.000000 | 3.000000 |
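#the same capping can be written in one step with the clip method, which bounds values to an interval
data.clip(-3,3)  #values below -3 become -3, values above 3 become 3; equivalent to the sign trick above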
Permutation and random sampling
df = DataFrame(np.arange(5*4).reshape(5,4))
#permuting (randomly reordering) the rows of a Series or DataFrame
sampler = np.random.permutation(5)
sampler
array([1, 0, 2, 3, 4])
df
|   | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| 0 | 0 | 1 | 2 | 3 |
| 1 | 4 | 5 | 6 | 7 |
| 2 | 8 | 9 | 10 | 11 |
| 3 | 12 | 13 | 14 | 15 |
| 4 | 16 | 17 | 18 | 19 |
df.take(sampler)
|   | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| 1 | 4 | 5 | 6 | 7 |
| 0 | 0 | 1 | 2 | 3 |
| 2 | 8 | 9 | 10 | 11 |
| 3 | 12 | 13 | 14 | 15 |
| 4 | 16 | 17 | 18 | 19 |
#another way to select a random subset without replacement
df.take(np.random.permutation(len(df))[:3])
|   | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| 1 | 4 | 5 | 6 | 7 |
| 3 | 12 | 13 | 14 | 15 |
| 4 | 16 | 17 | 18 | 19 |
#to generate a sample with replacement, the fastest way is to draw random integers with np.random.randint
bag = np.array([5,7,-1,6,4])
sampler = np.random.randint(0,len(bag),size=10)
sampler
array([4, 4, 2, 2, 2, 0, 3, 0, 4, 1])
draws = bag.take(sampler)  #take picks out the elements at the given integer positions
draws
array([ 4, 4, -1, -1, -1, 5, 6, 5, 4, 7])
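#newer pandas wraps both patterns in the sample method (results depend on the random seed):
df.sample(n=3)  #without replacement, like permutation + take
df.sample(n=10,replace=True)  #with replacement, like the randint draw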
Computing indicator/dummy variables
If a column in a DataFrame has k distinct values, you can derive a matrix or DataFrame with k columns of 1s and 0s.
pandas has a get_dummies function for doing this.
df = DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})
pd.get_dummies(df['key'])
|   | a | b | c |
|---|---|---|---|
| 0 | 0 | 1 | 0 |
| 1 | 0 | 1 | 0 |
| 2 | 1 | 0 | 0 |
| 3 | 0 | 0 | 1 |
| 4 | 1 | 0 | 0 |
| 5 | 0 | 1 | 0 |
#add a prefix to the columns of the indicator DataFrame
dummies = pd.get_dummies(df['key'],prefix='key')
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy
|   | data1 | key_a | key_b | key_c |
|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 |
| 1 | 1 | 0 | 1 | 0 |
| 2 | 2 | 1 | 0 | 0 |
| 3 | 3 | 0 | 0 | 1 |
| 4 | 4 | 1 | 0 | 0 |
| 5 | 5 | 0 | 1 | 0 |
#if a row in a DataFrame belongs to multiple categories
mnames = ['movie_id','title','genres']
movies = pd.read_table("E:\python_study_files\python\pydata-book-2nd-edition\datasets\movielens\movies.dat",sep='::',header=None,names=mnames)
movies[:10]
C:\windows\Temp/ipykernel_6352/824848908.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
|   | movie_id | title | genres |
|---|---|---|---|
| 0 | 1 | Toy Story (1995) | Animation\|Children's\|Comedy |
| 1 | 2 | Jumanji (1995) | Adventure\|Children's\|Fantasy |
| 2 | 3 | Grumpier Old Men (1995) | Comedy\|Romance |
| 3 | 4 | Waiting to Exhale (1995) | Comedy\|Drama |
| 4 | 5 | Father of the Bride Part II (1995) | Comedy |
| 5 | 6 | Heat (1995) | Action\|Crime\|Thriller |
| 6 | 7 | Sabrina (1995) | Comedy\|Romance |
| 7 | 8 | Tom and Huck (1995) | Adventure\|Children's |
| 8 | 9 | Sudden Death (1995) | Action |
| 9 | 10 | GoldenEye (1995) | Action\|Adventure\|Thriller |
#to add an indicator variable for each genre
genre_iter = (set(x.split('|')) for x in movies.genres)
genres = sorted(set.union(*genre_iter))
dummies =DataFrame(np.zeros((len(movies),len(genres))),columns=genres)
#iterate through each movie and set the appropriate entries in each row of dummies to 1
for i, gen in enumerate(movies.genres):
    dummies.loc[i,gen.split('|')] = 1
#then combine this with movies
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.loc[0]
movie_id 1
title Toy Story (1995)
genres Animation|Children's|Comedy
Genre_Action 0.0
Genre_Adventure 0.0
Genre_Animation 1.0
Genre_Children's 1.0
Genre_Comedy 1.0
Genre_Crime 0.0
Genre_Documentary 0.0
Genre_Drama 0.0
Genre_Fantasy 0.0
Genre_Film-Noir 0.0
Genre_Horror 0.0
Genre_Musical 0.0
Genre_Mystery 0.0
Genre_Romance 0.0
Genre_Sci-Fi 0.0
Genre_Thriller 0.0
Genre_War 0.0
Genre_Western 0.0
Name: 0, dtype: object
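#a shorter route for this case: str.get_dummies splits each string on the separator and builds the indicator matrix in one step (the same 0/1 columns, as integers rather than floats)
dummies = movies.genres.str.get_dummies('|')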
#combined with a discretization function like cut
values = np.random.rand(10)
values
array([0.75603383, 0.90830844, 0.96588737, 0.17373658, 0.87592824,
       0.75415641, 0.163486  , 0.23784062, 0.85564381, 0.58743194])
bins = [0,0.2,0.4,0.6,0.8,1]
pd.get_dummies(pd.cut(values,bins))
|   | (0.0, 0.2] | (0.2, 0.4] | (0.4, 0.6] | (0.6, 0.8] | (0.8, 1.0] |
|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 0 |
| 1 | 0 | 0 | 0 | 0 | 1 |
| 2 | 0 | 0 | 0 | 0 | 1 |
| 3 | 1 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 1 |
| 5 | 0 | 0 | 0 | 1 | 0 |
| 6 | 1 | 0 | 0 | 0 | 0 |
| 7 | 0 | 1 | 0 | 0 | 0 |
| 8 | 0 | 0 | 0 | 0 | 1 |
| 9 | 0 | 0 | 1 | 0 | 0 |
String manipulation
String object methods
val = 'a,b, guido'
val.split(',')
['a', 'b', ' guido']
#strip trims whitespace (including newlines)
pieces = [x.strip() for x in val.split(',')]
pieces
['a', 'b', 'guido']
first, second, third = pieces
first + '::' + second + '::' + third
'a::b::guido'
'::'.join(pieces)
'a::b::guido'
'guido' in val
True
val.index(',')
1
val.find(':')
-1
#the difference between find and index: index raises an exception when the string is not found (rather than returning -1)
val.index(':')
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
C:\windows\Temp/ipykernel_13684/2927268062.py in <module>
----> 1 val.index(':')

ValueError: substring not found
#count returns the number of occurrences of a substring
val.count(',')
2
val.replace(',','::')
'a::b:: guido'
val.replace(',','')
'ab guido'
Regular expressions
Regular expressions provide a flexible way to search for or match string patterns in text.
The functions in the re module fall into three categories: pattern matching, substitution, and splitting.
import re
text = "foo bar\t baz \tqux"
#the regex describing one or more whitespace characters is \s+
re.split(r'\s+',text)
['foo', 'bar', 'baz', 'qux']
#if you intend to apply the same regex to many strings, compile it first
regex = re.compile(r'\s+')
regex.split(text)
['foo', 'bar', 'baz', 'qux']
regex.findall(text)
[' ', '\t ', ' \t']
findall returns all matches in a string, search returns only the first match, and match only matches at the beginning of the string.
text = """Dave dave@google
Steve steve@gmail
Rob rob@gmail
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
#re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern,flags=re.IGNORECASE)
regex.findall(text)
['dave@google.com', 'steve@gmail.com', 'rob@gmail.com']
m = regex.search(text)
m
<re.Match object; span=(5, 20), match='dave@google.com'>
text[m.start():m.end()]
'dave@google.com'
print(regex.match(text))
None
#sub replaces matched patterns with the given string and returns the resulting new string
print(regex.sub('REDACTED',text))
Dave REDACTED
Steve REDACTED
Rob REDACTED
#to split an address into its three parts (username, domain, and domain suffix), wrap each part of the pattern in parentheses
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern,flags=re.IGNORECASE)
m = regex.match('wesm@bright.net')
m.groups()
('wesm', 'bright', 'net')
regex.findall(text)
[('dave', 'google', 'com'), ('steve', 'gmail', 'com'), ('rob', 'gmail', 'com')]
#sub can also refer to the groups of each match with special symbols like \1 and \2
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3',text))
Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
regex = repile(r"""
(?P<username>[A-Z0-9._%+-]+)
@
(?P<domain>[A-Z0-9.-]+)
\.
(?P<suffix>[A-Z]{2,4})""",flags=re.IGNORECASE|re.VERBOSE)
m = regex.match('wesm@bright.net')
m.groupdict()
{'username': 'wesm', 'domain': 'bright', 'suffix': 'net'}
Vectorized string functions in pandas
data = {'Dave':'dave@google.com','Steve':'steve@gmail.com','Rob':'rob@gmail.com','Wes':np.nan}
data = Series(data)
data
Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object
data.isnull()
Dave False
Steve False
Rob False
Wes True
dtype: bool
data.str.contains('gmail')
Dave False
Steve True
Rob True
Wes NaN
dtype: object
pattern
'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'
#data.str.findall(pattern,flags=re.IGNORECASE)
x = data.str.findall(pattern,flags=re.IGNORECASE)
x
Dave [(dave, google, com)]
Steve [(steve, gmail, com)]
Rob [(rob, gmail, com)]
Wes NaN
dtype: object
matches = data.str.match(pattern,flags=re.IGNORECASE)
matches
Dave True
Steve True
Rob True
Wes NaN
dtype: object
x.str.get(1)
Dave NaN
Steve NaN
Rob NaN
Wes NaN
dtype: float64
x.str[0]
Dave (dave, google, com)
Steve (steve, gmail, com)
Rob (rob, gmail, com)
Wes NaN
dtype: object
#slicing substrings
data.str[:5]
Dave dave@
Steve steve
Rob rob@g
Wes NaN
dtype: object
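#str.extract returns the regex groups as the columns of a DataFrame; with the same pattern as above:
data.str.extract(pattern,flags=re.IGNORECASE)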
Example: USDA food database
import json
db = json.load(open("E:\\python_study_files\\python\\pydata-book-2nd-edition\\datasets\\usda_food\\database.json"))
len(db)
6636
#each entry in db is a dict containing all the data for one food
db[0].keys()
dict_keys(['id', 'description', 'tags', 'manufacturer', 'group', 'portions', 'nutrients'])
db[0]['nutrients'][0]
{'value': 25.18, 'units': 'g', 'description': 'Protein', 'group': 'Composition'}
nutrients = DataFrame(db[0]['nutrients'])
nutrients[:7]
|   | value | units | description | group |
|---|---|---|---|---|
| 0 | 25.18 | g | Protein | Composition |
| 1 | 29.20 | g | Total lipid (fat) | Composition |
| 2 | 3.06 | g | Carbohydrate, by difference | Composition |
| 3 | 3.28 | g | Ash | Other |
| 4 | 376.00 | kcal | Energy | Energy |
| 5 | 39.28 | g | Water | Composition |
| 6 | 1573.00 | kJ | Energy | Energy |
info_keys = ['description','group','id','manufacturer']
info = DataFrame(db,columns=info_keys)
info[:5]
|   | description | group | id | manufacturer |
|---|---|---|---|---|
| 0 | Cheese, caraway | Dairy and Egg Products | 1008 |   |
| 1 | Cheese, cheddar | Dairy and Egg Products | 1009 |   |
| 2 | Cheese, edam | Dairy and Egg Products | 1018 |   |
| 3 | Cheese, feta | Dairy and Egg Products | 1019 |   |
| 4 | Cheese, mozzarella, part skim milk | Dairy and Egg Products | 1028 |   |
info
|   | description | group | id | manufacturer |
|---|---|---|---|---|
| 0 | Cheese, caraway | Dairy and Egg Products | 1008 |   |
| 1 | Cheese, cheddar | Dairy and Egg Products | 1009 |   |
| 2 | Cheese, edam | Dairy and Egg Products | 1018 |   |
| 3 | Cheese, feta | Dairy and Egg Products | 1019 |   |
| 4 | Cheese, mozzarella, part skim milk | Dairy and Egg Products | 1028 |   |
| ... | ... | ... | ... | ... |
| 6631 | Bologna, beef, low fat | Sausages and Luncheon Meats | 42161 |   |
| 6632 | Turkey and pork sausage, fresh, bulk, patty or... | Sausages and Luncheon Meats | 42173 |   |
| 6633 | Babyfood, juice, pear | Baby Foods | 43408 | None |
| 6634 | Babyfood, dessert, banana yogurt, strained | Baby Foods | 43539 | None |
| 6635 | Babyfood, banana no tapioca, strained | Baby Foods | 43546 | None |
6636 rows × 4 columns
pd.value_counts(info.group)[:10]
Vegetables and Vegetable Products 812
Beef Products 618
Baked Products 496
Breakfast Cereals 403
Legumes and Legume Products 365
Fast Foods 365
Lamb, Veal, and Game Products 345
Sweets 341
Fruits and Fruit Juices 328
Pork Products 328
Name: group, dtype: int64
#to do some analysis on all of the nutrient data
nutrients = []
for rec in db:
    fnuts = DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)
nutrients = pd.concat(nutrients,ignore_index=True)
nutrients
|   | value | units | description | group | id |
|---|---|---|---|---|---|
| 0 | 25.180 | g | Protein | Composition | 1008 |
| 1 | 29.200 | g | Total lipid (fat) | Composition | 1008 |
| 2 | 3.060 | g | Carbohydrate, by difference | Composition | 1008 |
| 3 | 3.280 | g | Ash | Other | 1008 |
| 4 | 376.000 | kcal | Energy | Energy | 1008 |
| ... | ... | ... | ... | ... | ... |
| 389350 | 0.000 | mcg | Vitamin B-12, added | Vitamins | 43546 |
| 389351 | 0.000 | mg | Cholesterol | Other | 43546 |
| 389352 | 0.072 | g | Fatty acids, total saturated | Other | 43546 |
| 389353 | 0.028 | g | Fatty acids, total monounsaturated | Other | 43546 |
| 389354 | 0.041 | g | Fatty acids, total polyunsaturated | Other | 43546 |
389355 rows × 5 columns
#drop duplicates
nutrients.duplicated().sum()
14179
nutrients = nutrients.drop_duplicates()
col_mapping = {'description':'food','group':'fgroup'}
info = info.rename(columns=col_mapping,copy=False)
info
|   | food | fgroup | id | manufacturer |
|---|---|---|---|---|
| 0 | Cheese, caraway | Dairy and Egg Products | 1008 |   |
| 1 | Cheese, cheddar | Dairy and Egg Products | 1009 |   |
| 2 | Cheese, edam | Dairy and Egg Products | 1018 |   |
| 3 | Cheese, feta | Dairy and Egg Products | 1019 |   |
| 4 | Cheese, mozzarella, part skim milk | Dairy and Egg Products | 1028 |   |
| ... | ... | ... | ... | ... |
| 6631 | Bologna, beef, low fat | Sausages and Luncheon Meats | 42161 |   |
| 6632 | Turkey and pork sausage, fresh, bulk, patty or... | Sausages and Luncheon Meats | 42173 |   |
| 6633 | Babyfood, juice, pear | Baby Foods | 43408 | None |
| 6634 | Babyfood, dessert, banana yogurt, strained | Baby Foods | 43539 | None |
| 6635 | Babyfood, banana no tapioca, strained | Baby Foods | 43546 | None |
6636 rows × 4 columns
col_mapping = {'description':'nutrient','group':'nutgroup'}
nutrients = nutrients.rename(columns=col_mapping,copy=False)
nutrients
|   | value | units | nutrient | nutgroup | id |
|---|---|---|---|---|---|
| 0 | 25.180 | g | Protein | Composition | 1008 |
| 1 | 29.200 | g | Total lipid (fat) | Composition | 1008 |
| 2 | 3.060 | g | Carbohydrate, by difference | Composition | 1008 |
| 3 | 3.280 | g | Ash | Other | 1008 |
| 4 | 376.000 | kcal | Energy | Energy | 1008 |
| ... | ... | ... | ... | ... | ... |
| 389350 | 0.000 | mcg | Vitamin B-12, added | Vitamins | 43546 |
| 389351 | 0.000 | mg | Cholesterol | Other | 43546 |
| 389352 | 0.072 | g | Fatty acids, total saturated | Other | 43546 |
| 389353 | 0.028 | g | Fatty acids, total monounsaturated | Other | 43546 |
| 389354 | 0.041 | g | Fatty acids, total polyunsaturated | Other | 43546 |
375176 rows × 5 columns
#merge info with nutrients
ndata = pd.merge(nutrients,info,on='id',how='outer')
ndata
|   | value | units | nutrient | nutgroup | id | food | fgroup | manufacturer |
|---|---|---|---|---|---|---|---|---|
| 0 | 25.180 | g | Protein | Composition | 1008 | Cheese, caraway | Dairy and Egg Products |   |
| 1 | 29.200 | g | Total lipid (fat) | Composition | 1008 | Cheese, caraway | Dairy and Egg Products |   |
| 2 | 3.060 | g | Carbohydrate, by difference | Composition | 1008 | Cheese, caraway | Dairy and Egg Products |   |
| 3 | 3.280 | g | Ash | Other | 1008 | Cheese, caraway | Dairy and Egg Products |   |
| 4 | 376.000 | kcal | Energy | Energy | 1008 | Cheese, caraway | Dairy and Egg Products |   |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 375171 | 0.000 | mcg | Vitamin B-12, added | Vitamins | 43546 | Babyfood, banana no tapioca, strained | Baby Foods | None |
| 375172 | 0.000 | mg | Cholesterol | Other | 43546 | Babyfood, banana no tapioca, strained | Baby Foods | None |
| 375173 | 0.072 | g | Fatty acids, total saturated | Other | 43546 | Babyfood, banana no tapioca, strained | Baby Foods | None |
| 375174 | 0.028 | g | Fatty acids, total monounsaturated | Other | 43546 | Babyfood, banana no tapioca, strained | Baby Foods | None |
| 375175 | 0.041 | g | Fatty acids, total polyunsaturated | Other | 43546 | Babyfood, banana no tapioca, strained | Baby Foods | None |
375176 rows × 8 columns
result = ndata.groupby(['nutrient','fgroup'])['value'].quantile(0.5)  #median value for each nutrient within each food group
result['Zinc, Zn'].sort_values().plot(kind='barh')
by_nutrient = ndata.groupby(['nutgroup','nutrient'])
get_maximum = lambda x: x.xs(x.value.idxmax())  #the row with the largest value in each group
get_minimum = lambda x: x.xs(x.value.idxmin())  #the row with the smallest value in each group
max_foods = by_nutrient.apply(get_maximum)[['value','food']]
#make the food column smaller by truncating it to 50 characters
max_foods.food = max_foods.food.str[:50]
max_foods.loc['Amino Acids']['food']
nutrient
Alanine Gelatins, dry powder, unsweetened
Arginine Seeds, sesame flour, low-fat
Aspartic acid Soy protein isolate
Cystine Seeds, cottonseed flour, low fat (glandless)
Glutamic acid Soy protein isolate
Glycine Gelatins, dry powder, unsweetened
Histidine Whale, beluga, meat, dried (Alaska Native)
Hydroxyproline KENTUCKY FRIED CHICKEN, Fried Chicken, ORIGINA...
Isoleucine Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Leucine Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Lysine Seal, bearded (Oogruk), meat, dried (Alaska Na...
Methionine Fish, cod, Atlantic, dried and salted
Phenylalanine Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Proline Gelatins, dry powder, unsweetened
Serine Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Threonine Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Tryptophan Sea lion, Steller, meat with fat (Alaska Native)
Tyrosine Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Valine Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Name: food, dtype: object