DataFrame是一个类似表格的数据结构,索引包括列索引和行索引,包含有一组有序的列,每列可以是不同的值类型(数值、字符串、布尔值等)。DataFrame的每一行和每一列都是一个Series,这个Series的name属性为当前的行索引名/列索引名。
使用字典生成DataFrame
#使用字典生成DataFrame
from pandas import DataFrame, Series
data = DataFrame(
    data={
        'state': ['ok', 'ok', 'good', 'bad'],
        'year': [2000, 2001, 2002, 2003],
        'pop': [3.7, 3.6, 2.4, 0.9],
    },
)
# Without an explicit index the rows are labelled 0, 1, 2, 3.
print(data)
state year pop
0 ok 2000 3.7
1 ok 2001 3.6
2 good 2002 2.4
3 bad 2003 0.9
# Select and order columns by name; a requested column that is not present in
# the data ('debt') is added and filled with NaN.
print (DataFrame(data, columns = ['year', 'state', 'pop','debt']))
year state pop debt
0 2000 ok 3.7 NaN
1 2001 ok 3.6 NaN
2 2002 good 2.4 NaN
3 2003 bad 0.9 NaN
# Specify the row index.
# Passing index=... together with an existing DataFrame makes the constructor
# align on labels (a reindex); none of 'one'..'four' exist in data's 0..3
# index, so every value would come back as NaN. Pick the columns first, then
# relabel the rows.
x = data.reindex(columns=['year', 'state', 'pop', 'debt'])
x.index = ['one', 'two', 'three', 'four']
print(x)
year state pop debt
one 2000 ok 3.7 NaN
two 2001 ok 3.6 NaN
three 2002 good 2.4 NaN
four 2003 bad 0.9 NaN
# Access a single column by its label; the result is a Series.
print(data['state'])
0 ok
1 ok
2 good
3 bad
Name: state, dtype: object
DataFrame元素的索引与修改
#原数据框
year state pop debt
one 2000 ok 3.7 NaN
two 2001 ok 3.6 NaN
three 2002 good 2.4 NaN
four 2003 bad 0.9 NaN
import numpy
# Select the 'state' column of x as a Series.
print(x['state'])
one ok
two ok
three good
four bad
Name: state, dtype: object
# Overwrite a whole column: the scalar is broadcast to every row.
x['debt'] = 16.5
print(x)
year state pop debt
one 2000 ok 3.7 16.5
two 2001 ok 3.6 16.5
three 2002 good 2.4 16.5
four 2003 bad 0.9 16.5
# Fill the existing 'debt' column from a NumPy array (one value per row).
x['debt'] = numpy.arange(4)
print(x)
year state pop debt
one 2000 ok 3.7 0
two 2001 ok 3.6 1
three 2002 good 2.4 2
four 2003 bad 0.9 3
# Assign a Series to a column. Values are aligned on the row labels: frame
# rows missing from the Series ('three', 'four') become NaN, and Series labels
# that are not rows of the frame ('five', 'six') are ignored.
val = Series([-1.2, -1.5, -1.7, 0], index=['one', 'two', 'five', 'six'])
x['debt'] = val  # the DataFrame's row index stays unchanged
print(x)
year state pop debt
one 2000 ok 3.7 -1.2
two 2001 ok 3.6 -1.5
three 2002 good 2.4 NaN
four 2003 bad 0.9 NaN
# Append a row: assigning through .loc with a label that does not exist yet
# enlarges the frame. len(x) is used as the new label, so the added row gets
# an integer label next to the existing string labels.
x.loc[len(x)]=[2,3,4,5]
print(x)
year state pop debt
one 2000 ok 3.7 -1.2
two 2001 ok 3.6 -1.5
three 2002 good 2.4 NaN
four 2003 bad 0.9 NaN
4 2 3 4.0 5.0
# Add a column by assigning to a new column name; the list must supply one
# value per row.
x['newColumn']=[1,1,1,1,1]
print(x)
year state pop debt newColumn
one 2000 ok 3.7 -1.2 1
two 2001 ok 3.6 -1.5 1
three 2002 good 2.4 NaN 1
four 2003 bad 0.9 NaN 1
4 2 3 4.0 5.0 1
# Transpose the DataFrame: row labels become column labels and vice versa.
print(x.T)
one two three four 4
year 2000 2001 2002 2003 2
state ok ok good bad 3
pop 3.7 3.6 2.4 0.9 4
debt -1.2 -1.5 NaN NaN 5
newColumn 1 1 1 1 1
DataFrame算术:不重叠部分为NaN,重叠部分元素运算
# x: 3x3 float frame with rows a-c; y: 4x3 integer frame with rows a-d.
x = DataFrame(
    numpy.arange(9, dtype=float).reshape(3, 3),
    index=list('abc'),
    columns=list('ABC'),
)
y = DataFrame(
    numpy.arange(12).reshape(4, 3),
    index=list('abcd'),
    columns=list('ABC'),
)
print(x)
A B C
a 0.0 1.0 2.0
b 3.0 4.0 5.0
c 6.0 7.0 8.0
# y has the same columns as x plus one extra row labelled 'd'.
print(y)
A B C
a 0 1 2
b 3 4 5
c 6 7 8
d 9 10 11
# Addition aligns on row and column labels; row 'd' exists only in y, so the
# result is NaN there.
print(x+y)
A B C
a 0.0 2.0 4.0
b 6.0 8.0 10.0
c 12.0 14.0 16.0
d NaN NaN NaN
# DataFrame-with-Series arithmetic: the operation is applied to every row or
# every column of the frame.
frame = DataFrame(
    numpy.arange(9).reshape(3, 3),
    index=list('abc'),
    columns=list('ABC'),
)
print(frame)
A B C
a 0 1 2
b 3 4 5
c 6 7 8
# Take the first row by position. The .ix indexer was removed in pandas 1.0,
# so the positional indexer .iloc is used instead.
series = frame.iloc[0]
print(series)
A 0
B 1
C 2
# The Series index (A, B, C) is matched against the frame's columns and the
# subtraction is repeated for every row.
print(frame-series)
A B C
a 0 0 0
b 3 3 3
c 6 6 6
# Row-wise operation: a label that is missing from one operand produces NaN.
series2 = Series(range(4), index=list('ABCD'))
print(series2)
A 0
B 1
C 2
D 3
# Label 'D' exists only in series2, so the result gains an all-NaN column D.
print (frame + series2 )
A B C D
a 0 2 4 NaN
b 3 5 7 NaN
c 6 8 10 NaN
# Column 'A' as a Series indexed by the frame's row labels.
series3 = frame['A']
print(series3)
a 0
b 3
c 6
Name: A, dtype: int64
# axis=0 matches series3 against the row index and subtracts it from every
# column.
print(frame.sub(series3,axis=0))# column-wise subtraction, not a replacement
A B C
a 0 1 2
b 0 1 2
c 0 1 2
额外运算
# Two columns of five random samples each; the values differ on every run.
df = DataFrame(
    {name: numpy.random.randn(5) for name in ('column1', 'column2')}
)
print(df)
column1 column2
0 -0.336839 -0.420312
1 -1.172474 0.671025
2 -0.481245 0.292897
3 1.335457 -1.167297
4 -0.170178 0.140632
# Minimum of each column (apply works column by column by default).
print(df.apply(min))
column1 -1.172474
column2 -1.167297
dtype: float64
# Minimum of each row (axis=1 passes one row at a time to the function).
print(df.apply(min, axis=1))
0 -0.420312
1 -1.172474
2 -0.481245
3 -1.167297
4 -0.170178
dtype: float64
# For each row (axis=1), check whether all of its values are greater than 0.
print(df.apply(
lambda x: numpy.all(x>0),
axis=1
))
0 False
1 False
2 False
3 False
4 False
dtype: bool
# Use the per-row boolean result as a mask: keep only the rows in which every
# value is greater than 0.
print(DataFrame(df[df.apply(
lambda x: numpy.all(x>0),
axis=1
)]))
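column1 column2
3 0.826535 0.415204
(注:numpy.random.randn 每次运行生成的随机数都不同,上面这行输出来自另一次运行;按前文打印的 df,每一行的判断结果都是 False,此时筛选结果应为 Empty DataFrame。)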