import
In[1]: import numpy as np
In[2]: import pandas as pd
In[3]: import matplotlib.pyplot as plt
创建对象
通过list创建Series,pandas创建默认的整数索引
In[4]: s = pd.Series([1,3,5,np.nan,6,8]
In[5]: s
Out[5]:
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
通过numpy array创建DataFrame,以datetime做为索引,并赋以列标签
In[6]: dates = pd.date_range('20130101', periods=6)
In[7]: dates
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
In[8]: df
A B C D
2013-01-01 -0.463764 -0.129273 -0.593354 -0.567024
2013-01-02 1.247014 0.144953 -0.261148 -1.134928
2013-01-03 0.054880 0.472487 0.325529 1.444452
2013-01-04 2.060033 -0.365392 -1.565857 -0.629383
2013-01-05 1.131430 1.194570 -0.295539 -1.198887
2013-01-06 -1.027478 1.643327 0.328694 -0.321054
通过可以转换为Series的字典创建DateFrame
In[9]: df2 = pd.DataFrame({ 'A' : 1.,
'B' : pd.Timestamp('20130102'),
'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
'D' : np.array([3] * 4,dtype='int32'),
'E' : pd.Categorical(["test","train","test","train"]),
'F' : 'foo' })
In[10]: df2
预览数据
查看数据开始或结尾的n行数据
In[11]: df.head(2)
A B C D
2013-01-01 -0.463764 -0.129273 -0.593354 -0.567024
2013-01-02 1.247014 0.144953 -0.261148 -1.134928
In[12]: df.tail(2)
A B C D
2013-01-05 1.131430 1.194570 -0.295539 -1.198887
2013-01-06 -1.027478 1.643327 0.328694 -0.321054
查看数据索引、列标签和数据
In[13]: df.index
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04', '2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
In[14]: df.columns
Index([u'A', u'B', u'C', u'D'], dtype='object')
In[15]: df.values
[[-0.46376373 -0.12927311 -0.59335418 -0.56702388]
[ 1.24701447 0.1449526 -0.26114762 -1.13492832]
[ 0.05488003 0.47248704 0.3255288 1.4444521 ]
[ 2.06003285 -0.36539171 -1.56585717 -0.62938329]
[ 1.13143027 1.19457009 -0.29553921 -1.19888659]
[-1.02747768 1.64332729 0.32869363 -0.32105412]]
数据基本统计信息描述
In[16]: df.describe()
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.500353 0.493445 -0.343613 -0.401137
std 1.170521 0.782441 0.702155 0.966135
min -1.027478 -0.365392 -1.565857 -1.198887
25% -0.334103 -0.060717 -0.518900 -1.008542
50% 0.593155 0.308720 -0.278343 -0.598204
75% 1.218118 1.014049 0.178860 -0.382547
max 2.060033 1.643327 0.328694 1.444452
数据转置
In[17]: df.T
2013-01-01 2013-01-02 2013-01-03 2013-01-04 2013-01-05 2013-01-06
A -0.463764 1.247014 0.054880 2.060033 1.131430 -1.027478
B -0.129273 0.144953 0.472487 -0.365392 1.194570 1.643327
C -0.593354 -0.261148 0.325529 -1.565857 -0.295539 0.328694
D -0.567024 -1.134928 1.444452 -0.629383 -1.198887 -0.321054
按照一个轴的索引排序
In[18]: df.sort_index(axis=1, ascending=False)
D C B A
2013-01-01 -0.567024 -0.593354 -0.129273 -0.463764
2013-01-02 -1.134928 -0.261148 0.144953 1.247014
2013-01-03 1.444452 0.325529 0.472487 0.054880
2013-01-04 -0.629383 -1.565857 -0.365392 2.060033
2013-01-05 -1.198887 -0.295539 1.194570 1.131430
2013-01-06 -0.321054 0.328694 1.643327 -1.027478
按照数据排序
In[19]: df.sort_values(by='B')
A B C D
2013-01-06 -1.027478 1.643327 0.328694 -0.321054
2013-01-05 1.131430 1.194570 -0.295539 -1.198887
2013-01-03 0.054880 0.472487 0.325529 1.444452
2013-01-02 1.247014 0.144953 -0.261148 -1.134928
2013-01-01 -0.463764 -0.129273 -0.593354 -0.567024
2013-01-04 2.060033 -0.365392 -1.565857 -0.629383
数据选择
使用panda过程中推荐使用优化过的pandas数据索引方式:at,iat,loc,iloc,ix
选择单独一列,得到一个Series数据结构,df['A'] 等同于df.A
In[20]: df["A"]
2013-01-01 -0.463764
2013-01-02 1.247014
2013-01-03 0.054880
2013-01-04 2.060033
2013-01-05 1.131430
2013-01-06 -1.027478
Freq: D, Name: A, dtype: float64
通过行切片选择
In[21]: df[0:3]
A B C D
2013-01-01 -0.463764 -0.129273 -0.593354 -0.567024
2013-01-02 1.247014 0.144953 -0.261148 -1.134928
2013-01-03 0.054880 0.472487 0.325529 1.444452
In[22]: df["2013-01-01":"2013-01-03"]
A B C D
2013-01-01 -0.463764 -0.129273 -0.593354 -0.567024
2013-01-02 1.247014 0.144953 -0.261148 -1.134928
2013-01-03 0.054880 0.472487 0.325529 1.444452
通过标签选择
In[23]: print df.loc[dates[0]]
A -0.463764
B -0.129273
C -0.593354
D -0.567024
Name: 2013-01-01 00:00:00, dtype: float64
通过多个轴标签进行选择
In[24]: df.loc[:,['A','B']]
A B
2013-01-01 -0.463764 -0.129273
2013-01-02 1.247014 0.144953
2013-01-03 0.054880 0.472487
2013-01-04 2.060033 -0.365392
2013-01-05 1.131430 1.194570
2013-01-06 -1.027478 1.643327
数据包含了endpoint
In[25]: df.loc['20130102':'20130104',['A','B']]
A B
2013-01-02 1.247014 0.144953
2013-01-03 0.054880 0.472487
2013-01-04 2.060033 -0.365392
降低返回对象的维度
In[26]: df.loc['20130102',['A','B']]
A 1.247014
B 0.144953
Name: 2013-01-02 00:00:00, dtype: float64
得到一个标量值
In[27]: df.loc[dates[0],'A']
-0.46376373
快速获取一个标量值
In[28]: df.at[dates[0],'A']
-0.46376373
通过位置进行选择
通过整数位置
In[29]: df.iloc[3]
A 2.060033
B -0.365392
C -1.565857
D -0.629383
Name: 2013-01-04 00:00:00, dtype: float64
通过整数切片
In[30]: df.iloc[3:5,0:2]
A B
2013-01-04 2.060033 -0.365392
2013-01-05 1.131430 1.194570
通过整数位置列表
In[31]: df.iloc[[1,2,4],[0,2]]
A C
2013-01-02 1.247014 -0.261148
2013-01-03 0.054880 0.325529
2013-01-05 1.131430 -0.295539
所有行,指定列
In[32]: df.iloc[:,1:3]
B C
2013-01-01 -0.129273 -0.593354
2013-01-02 0.144953 -0.261148
2013-01-03 0.472487 0.325529
2013-01-04 -0.365392 -1.565857
2013-01-05 1.194570 -0.295539
2013-01-06 1.643327 0.328694
所有列,指定行
In[33]: df.iloc[1:3,:]
A B C D
2013-01-02 1.247014 0.144953 -0.261148 -1.134928
2013-01-03 0.054880 0.472487 0.325529 1.444452
获取指定位置的值
In[34]: df.iat[1,1]
0.1449526
通过布尔进行选择
通过单独列的值选择数据
In[35]: df[df.A > 0]
A B C D
2013-01-02 1.247014 0.144953 -0.261148 -1.134928
2013-01-03 0.054880 0.472487 0.325529 1.444452
2013-01-04 2.060033 -0.365392 -1.565857 -0.629383
2013-01-05 1.131430 1.194570 -0.295539 -1.198887
选择DataFrame中满足boolean条件的值
In[36]: df[df > 0]
A B C D
2013-01-01 NaN NaN NaN NaN
2013-01-02 1.247014 0.144953 NaN NaN
2013-01-03 0.054880 0.472487 0.325529 1.444452
2013-01-04 2.060033 NaN NaN NaN
2013-01-05 1.131430 1.194570 NaN NaN
2013-01-06 NaN 1.643327 0.328694 NaN
通过isin()方法过滤
In[37]: df2[df2['E'].isin(['two','four'])]
A B C D E
2013-01-03 0.05488 0.472487 0.325529 1.444452 two
2013-01-05 1.13143 1.194570 -0.295539 -1.198887 four
赋值和更新
缺失数据
pandas用np.nan表示缺失数据
reindexing允许add/delete/change给定轴,指定索引的数据,并返回数据的copy
In[38]: df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
In[39]: df1.loc[dates[0]:dates[1],'E'] = 1
In[40]: df1
A B C D E
2013-01-01 -0.463764 -0.129273 -0.593354 -0.567024 1.0
2013-01-02 1.247014 0.144953 -0.261148 -1.134928 1.0
2013-01-03 0.054880 0.472487 0.325529 1.444452 NaN
2013-01-04 2.060033 -0.365392 -1.565857 -0.629383 NaN
删除所有包含缺失数据的行
In[41]: df1.dropna(how='any')
A B C D E
2013-01-01 -0.463764 -0.129273 -0.593354 -0.567024 1.0
2013-01-02 1.247014 0.144953 -0.261148 -1.134928 1.0
填充缺失数据
In[42]: df1.fillna(value=5)
A B C D E
2013-01-01 -0.463764 -0.129273 -0.593354 -0.567024 1.0
2013-01-02 1.247014 0.144953 -0.261148 -1.134928 1.0
2013-01-03 0.054880 0.472487 0.325529 1.444452 5.0
2013-01-04 2.060033 -0.365392 -1.565857 -0.629383 5.0
获取缺失数据掩码
In[43]: pd.isnull(df1)
A B C D E
2013-01-01 False False False False False
2013-01-02 False False False False False
2013-01-03 False False False False True
2013-01-04 False False False False True
operations
- Apply
In[44]: df.apply(np.cumsum)
A B C D
2013-01-01 -0.463764 -0.129273 -0.593354 -0.567024
2013-01-02 0.783251 0.015679 -0.854502 -1.701952
2013-01-03 0.838131 0.488167 -0.528973 -0.257500
2013-01-04 2.898164 0.122775 -2.094830 -0.886883
2013-01-05 4.029594 1.317345 -2.390369 -2.085770
2013-01-06 3.002116 2.960672 -2.061676 -2.406824
In[45]: df.apply(lambda x: x.max() - x.min())
A 3.087511
B 2.008719
C 1.894551
D 2.643339
dtype: float64
- 直方图
In[46]: s = pd.Series(np.random.randint(0, 7, size=10))
In[47]: s
0 1
1 1
2 6
3 4
4 6
5 5
6 4
7 2
8 1
9 3
dtype: int64
In[48]: s.value_counts()
1 3
6 2
4 2
5 1
3 1
2 1
dtype: int64
- 字符串操作
Series在str属性中封装了一系列的字符串操作方法,
In[49]: s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
In[50]: s.str.lower()
0 a
1 b
2 c
3 aaba
4 baca
5 NaN
6 caba
7 dog
8 cat
dtype: object
合并(Merge)
- concat
通过concat实现pandas对象的连接
In[51]: df = pd.DataFrame(np.random.randn(10, 4))
In[52]: pieces = [df[:3], df[3:7], df[7:]]
In[53]: pd.concat(pieces)
0 1 2 3
0 -1.305349 1.278290 -0.349311 -0.185377
1 -0.260212 1.732672 1.065511 1.231004
2 1.387606 0.411907 0.294163 -0.155383
3 0.348812 1.473364 -0.702673 0.012942
4 0.658293 -0.219758 1.668671 -1.047261
5 -0.734177 0.554792 1.383507 0.458022
6 -0.572618 -1.438856 -0.795051 0.973842
7 -0.225589 -1.799563 1.035444 -1.946946
8 -0.380544 -0.646450 -0.009933 -0.250325
9 -1.486679 -1.636571 0.597604 -0.410820
- join
SQL风格的数据合并
In[54]: left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
In[55]: right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
In[56]: left
key lval
0 foo 1
1 foo 2
In[57]: right
key rval
0 foo 4
1 foo 5
In[58]: pd.merge(left,right,on='key')
key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5
另外一个join的示例
In[59]: left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
In[60]: right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
In[61]: left
key lval
0 foo 1
1 bar 2
In[62]: right
key rval
0 foo 4
1 bar 5
In[63]: pd.merge(left, right, on='key')
key lval rval
0 foo 1 4
1 bar 2 5
- 追加(Append)
向dataframe中添加一行数据
In[64]: df =pd.DataFrame(np.random.randn(8,4),columns=['A','B','C','D'])
In[65]: df
A B C D
0 -0.908815 0.183665 0.114356 -0.138478
1 -1.152427 -0.784012 0.366452 -0.981429
2 -0.343616 -0.923369 -0.322267 -0.644783
3 -0.813925 0.656906 1.716464 -0.226989
4 0.641228 -1.245245 -0.339729 0.448473
5 -1.026384 -2.224948 -0.064727 2.480351
6 1.350869 0.599377 -0.070411 1.490076
7 0.832065 -1.882072 1.484589 -0.275415
In[66]: s = df.iloc[3]
In[67]: df.append(s,ignore_index=True)
A B C D
0 -0.908815 0.183665 0.114356 -0.138478
1 -1.152427 -0.784012 0.366452 -0.981429
2 -0.343616 -0.923369 -0.322267 -0.644783
3 -0.813925 0.656906 1.716464 -0.226989
4 0.641228 -1.245245 -0.339729 0.448473
5 -1.026384 -2.224948 -0.064727 2.480351
6 1.350869 0.599377 -0.070411 1.490076
7 0.832065 -1.882072 1.484589 -0.275415
8 -0.813925 0.656906 1.716464 -0.226989
Grouping
Grouping使用步骤:
- Splitting :基于一些标准把数据分成组
- Applying:分别对每个组应用操作函数
- Combining:将结果构建成一个数据结构
In[68]: df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],'C' : np.random.randn(8),'D' : np.random.randn(8)})
In[69]: df
A B C D
0 foo one -1.202872 -0.055224
1 bar one -1.814470 2.395985
2 foo two 1.018601 1.552825
3 bar three -0.595447 0.166599
4 foo two 1.395433 0.047609
5 bar two -0.392670 -0.136473
6 foo one 0.007207 -0.561757
7 foo three 1.928123 -1.623033
分组,然后对结果组应用和函数
In[70]: df.groupby('A').sum()
C D
A
bar 4.604649 2.508464
foo 0.600323 1.540229
按多个列分组形成一个分层索引,然后应用和函数。
In[71]: df.groupby(['A','B']).sum()
C D
A B
bar one 2.401016 1.031469
three 1.249291 0.191503
two 0.954343 1.285492
foo one 1.036652 1.632488
three 1.099448 0.161951
two -1.535778 -0.254209
数据reshape
- stack
In[72]: tuples = list(zip(*[['bar', 'bar', 'baz', 'baz','foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two','one', 'two', 'one', 'two']]))
In[73]: index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
In[74]: df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
In[75]: df2 = df[:4]
In[76]: df2
A B
first second
bar one 0.029399 -0.542108
two 0.282696 -0.087302
baz one -1.575170 1.771208
two 0.816482 1.100230
stack()操作将DataFrame “compress”到同一列
In[77]: stacked = df2.stack()
In[78]: stacked
first second
bar one A 2.812147
B -0.638122
two A -2.008341
B -2.214172
baz one A 0.454807
B -1.130973
two A -0.343961
B -0.188025
dtype: float64
unstack(),stack的逆操作
In[79]: stacked.unstack()
A B
first second
bar one 2.812147 -0.638122
two -2.008341 -2.214172
baz one 0.454807 -1.130973
two -0.343961 -0.188025
时间序列
pandas具有简单、强大和高效的时间处理函数,用于在频率转换时执行重采样操作。
In[80]: rng = pd.date_range('1/1/2012', periods=100, freq='S')
In[81]: ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
In[82]: ts = ts.resample('5Min').sum()
2012-01-01 25353
Freq: 5T, dtype: int64
时区表示
In[83]: rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
In[84]: ts = pd.Series(np.random.randn(len(rng)),index=rng)
In[85]: ts
2012-03-06 -0.182390
2012-03-07 0.701854
2012-03-08 -0.306677
2012-03-09 0.264777
2012-03-10 0.137497
Freq: D, dtype: float64
In[86]: tz = ts.tz_localize('UTC')
In[87]: tz
2012-03-06 00:00:00+00:00 -0.182390
2012-03-07 00:00:00+00:00 0.701854
2012-03-08 00:00:00+00:00 -0.306677
2012-03-09 00:00:00+00:00 0.264777
2012-03-10 00:00:00+00:00 0.137497
Freq: D, dtype: float64
转换成其它时区
In[88]: tz.tz_convert('US/Eastern')
2012-03-05 19:00:00-05:00 -0.182390
2012-03-06 19:00:00-05:00 0.701854
2012-03-07 19:00:00-05:00 -0.306677
2012-03-08 19:00:00-05:00 0.264777
2012-03-09 19:00:00-05:00 0.137497
Freq: D, dtype: float64
时间跨度转换
In[89]: rng = pd.date_range('1/1/2012', periods=5, freq='M')
In[90]: ts = pd.Series(np.random.randn(len(rng)), index=rng)
In[91]: ts
2012-01-01 2.195708
2012-01-02 -0.688733
2012-01-03 -1.197042
2012-01-04 -0.706386
2012-01-05 1.047988
Freq: D, dtype: float64
In[92]: ps = ts.to_period()
In[93]: ps
2012-01-01 2.195708
2012-01-02 -0.688733
2012-01-03 -1.197042
2012-01-04 -0.706386
2012-01-05 1.047988
Freq: D, dtype: float64
In[94]: ps.to_timestamp()
2012-01-01 2.195708
2012-01-02 -0.688733
2012-01-03 -1.197042
2012-01-04 -0.706386
2012-01-05 1.047988
Freq: D, dtype: float64
数据展示
In[95]: ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
In[96]: ts = ts.cumsum()
In[97]: ts.plot()
In[98]: plt.show()
在DataFrame中,plot是一个便利的绘图工具
In[99]: df = pd.DataFrame(np.random.randn(1000, 4), In[99]: In[100]: index=ts.index,columns=['A', 'B', 'C', 'D'])
In[101]: df = df.cumsum()
In[102]: plt.figure(); df.plot(); plt.legend(loc='best')
In[103]: plt.show()
数据的写入写出
读取CSV文件
In[104]: pd.read_csv('foo.csv')
写出CSV文件
In[105]: df.to_csv('foo.csv')