十九、数据整理(6)
作者:Chris Albon
译者:飞龙
协议:CC BY-NC-SA 4.0
在列中搜索某个值
# 导入模块
import pandas as pd
raw_data = {'first_name': ['Jason', 'Jason', 'Tina', 'Jake', 'Amy'],
            'last_name': ['Miller', 'Miller', 'Ali', 'Milner', 'Cooze'],
            'age': [42, 42, 36, 24, 73],
            'preTestScore': [4, 4, 31, 2, 3],
            'postTestScore': [25, 25, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df
| | first_name | last_name | age | preTestScore | postTestScore |
| --- | --- | --- | --- | --- | --- |
| 0 | Jason | Miller | 42 | 4 | 25 |
| 1 | Jason | Miller | 42 | 4 | 25 |
| 2 | Tina | Ali | 36 | 31 | 57 |
| 3 | Jake | Milner | 24 | 2 | 62 |
| 4 | Amy | Cooze | 73 | 3 | 70 |
# 在列中查找满足条件的值所在的位置
# 查看 postTestScore 大于 50 时对应的 preTestScore
df['preTestScore'].where(df['postTestScore'] > 50)
'''
0 NaN
1 NaN
2 31.0
3 2.0
4 3.0
Name: preTestScore, dtype: float64
'''
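补充示例(非原文内容):如果只想取出满足条件的行本身,而不是用 NaN 占位,也可以用布尔索引配合 loc,下面是一个简单的示意:
# 仅取出 postTestScore 大于 50 的行对应的 preTestScore
df.loc[df['postTestScore'] > 50, 'preTestScore']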
选择包含特定值的行和列
# 导入模块
import pandas as pd
# 设置 ipython 的最大行显示
pd.set_option('display.max_rows', 1000)
# 设置 ipython 的最大列宽
pd.set_option('display.max_columns', 50)
# 创建示例数据帧
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'year': [2012, 2012, 2013, 2014, 2014],
        'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df
| | name | reports | year |
| --- | --- | --- | --- |
| Cochice | Jason | 4 | 2012 |
| Pima | Molly | 24 | 2012 |
| Santa Cruz | Tina | 31 | 2013 |
| Maricopa | Jake | 2 | 2014 |
| Yuma | Amy | 3 | 2014 |
# 按照列值抓取行
value_list = ['Tina', 'Molly', 'Jason']
df[df.name.isin(value_list)]
| | name | reports | year |
| --- | --- | --- | --- |
| Cochice | Jason | 4 | 2012 |
| Pima | Molly | 24 | 2012 |
| Santa Cruz | Tina | 31 | 2013 |
# 获取列值不是某个值的行
df[~df.name.isin(value_list)]
| | name | reports | year |
| --- | --- | --- | --- |
| Maricopa | Jake | 2 | 2014 |
| Yuma | Amy | 3 | 2014 |
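补充示例(非原文内容):同样的筛选也可以用 query 写出,@ 用于引用局部变量,这里只是一个等价写法的示意:
# 列值在列表中的行
df.query("name in @value_list")
# 列值不在列表中的行
df.query("name not in @value_list")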
选择具有特定值的行
import pandas as pd
# 创建示例数据帧
data = {'name': ['Jason', 'Molly'],
        'country': [['Syria', 'Lebanon'], ['Spain', 'Morocco']]}
df = pd.DataFrame(data)
df
| | country | name |
| --- | --- | --- |
| 0 | [Syria, Lebanon] | Jason |
| 1 | [Spain, Morocco] | Molly |
df[df['country'].map(lambda country: 'Syria' in country)]
| | country | name |
| --- | --- | --- |
| 0 | [Syria, Lebanon] | Jason |
使用多个过滤器选择行
import pandas as pd
# 创建示例数据帧
data = {'name': ['A', 'B', 'C', 'D', 'E'],
        'score': [1, 2, 3, 4, 5]}
df = pd.DataFrame(data)
df
| | name | score |
| --- | --- | --- |
| 0 | A | 1 |
| 1 | B | 2 |
| 2 | C | 3 |
| 3 | D | 4 |
| 4 | E | 5 |
# 选择数据帧的行,其中 df.score 大于 1 且小于 5
df[(df['score'] > 1) & (df['score'] < 5)]
| | name | score |
| --- | --- | --- |
| 1 | B | 2 |
| 2 | C | 3 |
| 3 | D | 4 |
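补充示例(非原文内容):同样的区间过滤也可以写成一个 query 表达式,仅作示意:
# score 大于 1 且小于 5 的行
df.query('score > 1 and score < 5')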
根据条件选择数据帧的行
# 导入模块
import pandas as pd
import numpy as np
# 创建数据帧
raw_data = {'first_name': ['Jason', 'Molly', np.nan, np.nan, np.nan],
            'nationality': ['USA', 'USA', 'France', 'UK', 'UK'],
            'age': [42, 52, 36, 24, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'nationality', 'age'])
df
| | first_name | nationality | age |
| --- | --- | --- | --- |
| 0 | Jason | USA | 42 |
| 1 | Molly | USA | 52 |
| 2 | NaN | France | 36 |
| 3 | NaN | UK | 24 |
| 4 | NaN | UK | 70 |
# 方法 1:使用布尔变量
# 如果国籍是美国,则变量为 TRUE
american = df['nationality'] == "USA"
# 如果年龄大于 50,则变量为 TRUE
elderly = df['age'] > 50
# 选择所有国籍为美国且年龄大于 50 的案例
df[american & elderly]
| | first_name | nationality | age |
| --- | --- | --- | --- |
| 1 | Molly | USA | 52 |
# 方法 2:使用变量属性
# 选择所有不缺少名字且国籍为美国的案例
df[df['first_name'].notnull() & (df['nationality'] == "USA")]
| | first_name | nationality | age |
| --- | --- | --- | --- |
| 0 | Jason | USA | 42 |
| 1 | Molly | USA | 52 |
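补充示例(非原文内容):方法 1 中的两个条件也可以合并成一个 query 表达式,仅作示意:
# 国籍为美国且年龄大于 50 的行
df.query('nationality == "USA" and age > 50')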
数据帧简单示例
# 导入模块
import pandas as pd
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
            'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
            'age': [42, 52, 36, 24, 73],
            'preTestScore': [4, 24, 31, 2, 3],
            'postTestScore': [25, 94, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df
| | first_name | last_name | age | preTestScore | postTestScore |
| --- | --- | --- | --- | --- | --- |
| 0 | Jason | Miller | 42 | 4 | 25 |
| 1 | Molly | Jacobson | 52 | 24 | 94 |
| 2 | Tina | Ali | 36 | 31 | 57 |
| 3 | Jake | Milner | 24 | 2 | 62 |
| 4 | Amy | Cooze | 73 | 3 | 70 |
# 创建第二个数据帧
raw_data_2 = {'first_name': ['Sarah', 'Gueniva', 'Know', 'Sara', 'Cat'],
              'last_name': ['Mornig', 'Jaker', 'Alom', 'Ormon', 'Koozer'],
              'age': [53, 26, 72, 73, 24],
              'preTestScore': [13, 52, 72, 26, 26],
              'postTestScore': [82, 52, 56, 234, 254]}
df_2 = pd.DataFrame(raw_data_2, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df_2
| | first_name | last_name | age | preTestScore | postTestScore |
| --- | --- | --- | --- | --- | --- |
| 0 | Sarah | Mornig | 53 | 13 | 82 |
| 1 | Gueniva | Jaker | 26 | 52 | 52 |
| 2 | Know | Alom | 72 | 72 | 56 |
| 3 | Sara | Ormon | 73 | 26 | 234 |
| 4 | Cat | Koozer | 24 | 26 | 254 |
# 创建第三个数据帧
raw_data_3 = {'first_name': ['Sarah', 'Gueniva', 'Know', 'Sara', 'Cat'],
              'last_name': ['Mornig', 'Jaker', 'Alom', 'Ormon', 'Koozer'],
              'postTestScore_2': [82, 52, 56, 234, 254]}
df_3 = pd.DataFrame(raw_data_3, columns = ['first_name', 'last_name', 'postTestScore_2'])
df_3
| | first_name | last_name | postTestScore_2 |
| --- | --- | --- | --- |
| 0 | Sarah | Mornig | 82 |
| 1 | Gueniva | Jaker | 52 |
| 2 | Know | Alom | 56 |
| 3 | Sara | Ormon | 234 |
| 4 | Cat | Koozer | 254 |
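补充示例(非原文内容):如果想把这几个数据帧组合到一起,可以按姓名用 pd.merge 把 df_2 和 df_3 连接起来,下面是一个简单的示意:
# 按 first_name 和 last_name 对 df_2 和 df_3 做内连接
pd.merge(df_2, df_3, on=['first_name', 'last_name'])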
排序数据帧的行
# 导入模块
import pandas as pd
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'year': [2012, 2012, 2013, 2014, 2014],
        'reports': [1, 2, 1, 2, 3],
        'coverage': [2, 2, 3, 3, 3]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df
| | coverage | name | reports | year |
| --- | --- | --- | --- | --- |
| Cochice | 2 | Jason | 1 | 2012 |
| Pima | 2 | Molly | 2 | 2012 |
| Santa Cruz | 3 | Tina | 1 | 2013 |
| Maricopa | 3 | Jake | 2 | 2014 |
| Yuma | 3 | Amy | 3 | 2014 |
# 按 reports 列对数据帧的行降序排序
df.sort_values(by='reports', ascending=False)
| | coverage | name | reports | year |
| --- | --- | --- | --- | --- |
| Yuma | 3 | Amy | 3 | 2014 |
| Pima | 2 | Molly | 2 | 2012 |
| Maricopa | 3 | Jake | 2 | 2014 |
| Cochice | 2 | Jason | 1 | 2012 |
| Santa Cruz | 3 | Tina | 1 | 2013 |
# 先按 coverage、再按 reports 对数据帧的行升序排序
df.sort_values(by=['coverage', 'reports'])
| | coverage | name | reports | year |
| --- | --- | --- | --- | --- |
| Cochice | 2 | Jason | 1 | 2012 |
| Pima | 2 | Molly | 2 | 2012 |
| Santa Cruz | 3 | Tina | 1 | 2013 |
| Maricopa | 3 | Jake | 2 | 2014 |
| Yuma | 3 | Amy | 3 | 2014 |
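补充示例(非原文内容):ascending 也可以接受一个布尔列表,对不同的列使用不同的排序方向,仅作示意:
# coverage 升序、reports 降序
df.sort_values(by=['coverage', 'reports'], ascending=[True, False])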
将经纬度坐标变量拆分为单独的变量
import pandas as pd
import numpy as np
raw_data = {'geo': ['40.0024, -105.4102', '40.0068, -105.266', '39.9318, -105.2813', np.nan]}
df = pd.DataFrame(raw_data, columns = ['geo'])
df
| | geo |
| --- | --- |
| 0 | 40.0024, -105.4102 |
| 1 | 40.0068, -105.266 |
| 2 | 39.9318, -105.2813 |
| 3 | NaN |
# 创建两个列表,用于存放循环的结果
lat = []
lon = []
# 对于 geo 列中的每一行
for row in df['geo']:
    # 尝试以下操作
    try:
        # 按逗号拆分该行,把逗号前的部分追加到 lat
        lat.append(row.split(',')[0])
        # 把逗号后的部分追加到 lon
        lon.append(row.split(',')[1])
    # 但是如果出现错误(例如缺失值)
    except:
        # 向 lat 添加缺失值
        lat.append(np.NaN)
        # 向 lon 添加缺失值
        lon.append(np.NaN)
# 从 lat 和 lon 创建新的两列
df['latitude'] = lat
df['longitude'] = lon
df
| | geo | latitude | longitude |
| --- | --- | --- | --- |
| 0 | 40.0024, -105.4102 | 40.0024 | -105.4102 |
| 1 | 40.0068, -105.266 | 40.0068 | -105.266 |
| 2 | 39.9318, -105.2813 | 39.9318 | -105.2813 |
| 3 | NaN | NaN | NaN |
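补充示例(非原文内容):同样的拆分也可以用向量化的 str.split 完成,不需要显式循环,下面是一个等价写法的示意:
# 按逗号把 geo 列拆成两列
coords = df['geo'].str.split(',', expand=True)
df['latitude'] = coords[0]
df['longitude'] = coords[1].str.strip()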
数据流水线
# 创建一些原始数据
raw_data = [1,2,3,4,5,6,7,8,9,10]
# 定义产生 input+6 的生成器
def add_6(numbers):
    for x in numbers:
        output = x + 6
        yield output
# 定义产生 input-2 的生成器
def subtract_2(numbers):
    for x in numbers:
        output = x - 2
        yield output
# 定义产生 input*100 的生成器
def multiply_by_100(numbers):
    for x in numbers:
        output = x * 100
        yield output
# 流水线的第一步
step1 = add_6(raw_data)
# 流水线的第二步
step2 = subtract_2(step1)
# 流水线的第三步
pipeline = multiply_by_100(step2)
# 流水线对第一个元素的处理结果
next(pipeline)
# 500
# 流水线对第二个元素的处理结果
next(pipeline)
# 600
# 处理剩余的所有数据
for item in pipeline:
    print(item)
'''
700
800
900
1000
1100
1200
1300
1400
'''
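补充示例(非原文内容):同样的三步流水线也可以用一个生成器表达式写出来,仅作示意:
# 等价的单个生成器表达式
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
pipeline2 = (((x + 6) - 2) * 100 for x in data)
next(pipeline2)
# 500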
数据帧中的字符串整理
# 导入模块
import pandas as pd
import numpy as np
import re as re
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
            'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
            'email': ['jas203@gmail.com', 'momomolly@gmail.com', np.NAN, 'battler@milner.com', 'Ames1234@yahoo.com'],
            'preTestScore': [4, 24, 31, 2, 3],
            'postTestScore': [25, 94, 57, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'email', 'preTestScore', 'postTestScore'])
df
# 电子邮件列中的哪些字符串包含 'gmail'
df['email'].str.contains('gmail')
'''
0 True
1 True
2 NaN
3 False
4 False
Name: email, dtype: object
'''
pattern = '([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'
df['email'].str.findall(pattern, flags=re.IGNORECASE)
'''
0 [(jas203, gmail, com)]
1 [(momomolly, gmail, com)]
2 NaN
3 [(battler, milner, com)]
4 [(Ames1234, yahoo, com)]
Name: email, dtype: object
'''
matches = df['email'].str.match(pattern, flags=re.IGNORECASE)
matches
'''
/Users/chrisralbon/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: In future versions of pandas, match will change to always return a bool indexer.
if __name__ == '__main__':
0 (jas203, gmail, com)
1 (momomolly, gmail, com)
2 NaN
3 (battler, milner, com)
4 (Ames1234, yahoo, com)
Name: email, dtype: object
'''
matches.str[1]
'''
0 gmail
1 gmail
2 NaN
3 milner
4 yahoo
Name: email, dtype: object
'''
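补充示例(非原文内容):如果想把各个分组直接展开成数据帧的列,可以用 str.extract(它只取第一个匹配),仅作示意:
# 把用户名、域名、后缀分别展开为一列
df['email'].str.extract(pattern, flags=re.IGNORECASE)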
和 Pandas 一起使用列表推导式
# 导入模块
import pandas as pd
# 设置 ipython 的最大行显示
pd.set_option('display.max_rows', 1000)
# 设置 ipython 的最大列宽
pd.set_option('display.max_columns', 50)
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'year': [2012, 2012, 2013, 2014, 2014],
        'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df
| | name | reports | year |
| --- | --- | --- | --- |
| Cochice | Jason | 4 | 2012 |
| Pima | Molly | 24 | 2012 |
| Santa Cruz | Tina | 31 | 2013 |
| Maricopa | Jake | 2 | 2014 |
| Yuma | Amy | 3 | 2014 |
先以普通循环的形式实现。
# 创建变量
next_year = []
# 对于 df.year 的每一行
for row in df['year']:
    # 将该行的值加 1 并附加到 next_year
    next_year.append(row + 1)
# 创建 df.next_year
df['next_year'] = next_year
# 查看数据帧
df
| | name | reports | year | next_year |
| --- | --- | --- | --- | --- |
| Cochice | Jason | 4 | 2012 | 2013 |
| Pima | Molly | 24 | 2012 | 2013 |
| Santa Cruz | Tina | 31 | 2013 | 2014 |
| Maricopa | Jake | 2 | 2014 | 2015 |
| Yuma | Amy | 3 | 2014 | 2015 |
再以列表推导式的形式实现。
# 对于 df.year 中的每一行,从行中减去 1
df['previous_year'] = [row-1 for row in df['year']]
df
| | name | reports | year | next_year | previous_year |
| --- | --- | --- | --- | --- | --- |
| Cochice | Jason | 4 | 2012 | 2013 | 2011 |
| Pima | Molly | 24 | 2012 | 2013 | 2011 |
| Santa Cruz | Tina | 31 | 2013 | 2014 | 2012 |
| Maricopa | Jake | 2 | 2014 | 2015 | 2013 |
| Yuma | Amy | 3 | 2014 | 2015 | 2013 |
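补充示例(非原文内容):对于这类逐行加减,向量化写法通常更简洁,仅作示意:
# 直接对整列做算术运算
df['next_year'] = df['year'] + 1
df['previous_year'] = df['year'] - 1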
使用 Seaborn 来可视化数据帧
import pandas as pd
%matplotlib inline
import random
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.DataFrame()
df['x'] = random.sample(range(1, 100), 25)
df['y'] = random.sample(range(1, 100), 25)
df.head()
| | x | y |
| --- | --- | --- |
| 0 | 18 | 25 |
| 1 | 42 | 67 |
| 2 | 52 | 77 |
| 3 | 4 | 34 |
| 4 | 14 | 69 |
# 散点图
sns.lmplot('x', 'y', data=df, fit_reg=False)
# <seaborn.axisgrid.FacetGrid at 0x114563b00>
# 密度图
sns.kdeplot(df.y)
# <matplotlib.axes._subplots.AxesSubplot at 0x113ea2ef0>
sns.kdeplot(df.y, df.x)
# <matplotlib.axes._subplots.AxesSubplot at 0x113d7fef0>
sns.distplot(df.x)
# <matplotlib.axes._subplots.AxesSubplot at 0x114294160>
# 直方图
plt.hist(df.x, alpha=.3)
sns.rugplot(df.x);
# 箱形图
sns.boxplot([df.y, df.x])
# <matplotlib.axes._subplots.AxesSubplot at 0x1142b8b38>
# 提琴图
sns.violinplot([df.y, df.x])
# <matplotlib.axes._subplots.AxesSubplot at 0x114444a58>
# 热力图
sns.heatmap([df.y, df.x], annot=True, fmt="d")
# <matplotlib.axes._subplots.AxesSubplot at 0x114530c88>
# 聚类图
sns.clustermap(df)
# <seaborn.matrix.ClusterGrid at 0x116f313c8>
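补充说明(非原文内容,基于较新版本 seaborn 的假设):新版 seaborn 要求用关键字参数指定 x 和 y,上面的散点图可以写成:
sns.lmplot(x='x', y='y', data=df, fit_reg=False)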
Pandas 数据结构
# 导入模块
import pandas as pd
序列 101
序列是一维数组(类似 R 的向量)。
# 创建洪水报告数量的序列 floodingReports
floodingReports = pd.Series([5, 6, 2, 9, 12])
floodingReports
'''
0 5
1 6
2 2
3 9
4 12
dtype: int64
'''
请注意,第一列数字(0 到 4)是索引。
# 将县名设置为 floodingReports 序列的索引
floodingReports = pd.Series([5, 6, 2, 9, 12], index=['Cochise County', 'Pima County', 'Santa Cruz County', 'Maricopa County', 'Yuma County'])
floodingReports
'''
Cochise County 5
Pima County 6
Santa Cruz County 2
Maricopa County 9
Yuma County 12
dtype: int64
'''
floodingReports['Cochise County']
# 5
floodingReports[floodingReports > 6]
'''
Maricopa County 9
Yuma County 12
dtype: int64
'''
从字典中创建 Pandas 序列。
注意:执行此操作时,字典的键将成为序列索引。
# 创建字典
fireReports_dict = {'Cochise County': 12, 'Pima County': 342, 'Santa Cruz County': 13, 'Maricopa County': 42, 'Yuma County' : 52}
# 将字典转换为 pd.Series,然后查看它
fireReports = pd.Series(fireReports_dict); fireReports
'''
Cochise County 12
Maricopa County 42
Pima County 342
Santa Cruz County 13
Yuma County 52
dtype: int64
'''
fireReports.index = ["Cochice", "Pima", "Santa Cruz", "Maricopa", "Yuma"]
fireReports
'''
Cochice 12
Pima 42
Santa Cruz 342
Maricopa 13
Yuma 52
dtype: int64
'''
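补充示例(非原文内容):序列支持向量化运算,也可以用 in 检查索引中是否存在某个标签,仅作示意:
# 所有值乘以 2
fireReports * 2
# 检查索引中是否有 'Pima'
'Pima' in fireReports
# True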
数据帧 101
数据帧就像 R 的数据帧。
# 从等长列表或 NumPy 数组的字典中创建数据帧
data = {'county': ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'],
        'year': [2012, 2012, 2013, 2014, 2014],
        'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data)
df
| | county | reports | year |
| --- | --- | --- | --- |
| 0 | Cochice | 4 | 2012 |
| 1 | Pima | 24 | 2012 |
| 2 | Santa Cruz | 31 | 2013 |
| 3 | Maricopa | 2 | 2014 |
| 4 | Yuma | 3 | 2014 |
# 使用 columns 属性设置列的顺序
dfColumnOrdered = pd.DataFrame(data, columns=['county', 'year', 'reports'])
dfColumnOrdered
| | county | year | reports |
| --- | --- | --- | --- |
| 0 | Cochice | 2012 | 4 |
| 1 | Pima | 2012 | 24 |
| 2 | Santa Cruz | 2013 | 31 |
| 3 | Maricopa | 2014 | 2 |
| 4 | Yuma | 2014 | 3 |
# 添加一列
dfColumnOrdered['newsCoverage'] = pd.Series([42.3, 92.1, 12.2, 39.3, 30.2])
dfColumnOrdered
| | county | year | reports | newsCoverage |
| --- | --- | --- | --- | --- |
| 0 | Cochice | 2012 | 4 | 42.3 |
| 1 | Pima | 2012 | 24 | 92.1 |
| 2 | Santa Cruz | 2013 | 31 | 12.2 |
| 3 | Maricopa | 2014 | 2 | 39.3 |
| 4 | Yuma | 2014 | 3 | 30.2 |
# 删除一列
del dfColumnOrdered['newsCoverage']
dfColumnOrdered
| | county | year | reports |
| --- | --- | --- | --- |
| 0 | Cochice | 2012 | 4 |
| 1 | Pima | 2012 | 24 |
| 2 | Santa Cruz | 2013 | 31 |
| 3 | Maricopa | 2014 | 2 |
| 4 | Yuma | 2014 | 3 |
# 转置数据帧
dfColumnOrdered.T
| | 0 | 1 | 2 | 3 | 4 |
| --- | --- | --- | --- | --- | --- |
| county | Cochice | Pima | Santa Cruz | Maricopa | Yuma |
| year | 2012 | 2012 | 2013 | 2014 | 2014 |
| reports | 4 | 24 | 31 | 2 | 3 |
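补充示例(非原文内容):取出单列会得到一个序列,用 loc 可以按行标签取出一行,仅作示意:
# 取出 county 列(序列)
dfColumnOrdered['county']
# 按行标签取出第 0 行
dfColumnOrdered.loc[0]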
Pandas 时间序列基础
# 导入模块
from datetime import datetime
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as pyplot
data = {'date': ['2014-05-01 18:47:05.069722', '2014-05-01 18:47:05.119994', '2014-05-02 18:47:05.178768', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.230071', '2014-05-02 18:47:05.280592', '2014-05-03 18:47:05.332662', '2014-05-03 18:47:05.385109', '2014-05-04 18:47:05.436523', '2014-05-04 18:47:05.486877'],
        'battle_deaths': [34, 25, 26, 15, 15, 14, 26, 25, 62, 41]}
df = pd.DataFrame(data, columns = ['date', 'battle_deaths'])
print(df)
'''
date battle_deaths
0 2014-05-01 18:47:05.069722 34
1 2014-05-01 18:47:05.119994 25
2 2014-05-02 18:47:05.178768 26
3 2014-05-02 18:47:05.230071 15
4 2014-05-02 18:47:05.230071 15
5 2014-05-02 18:47:05.280592 14
6 2014-05-03 18:47:05.332662 26
7 2014-05-03 18:47:05.385109 25
8 2014-05-04 18:47:05.436523 62
9 2014-05-04 18:47:05.486877 41
'''
df['date'] = pd.to_datetime(df['date'])
df.index = df['date']
del df['date']
df
| | battle_deaths |
| --- | --- |
| date | |
| 2014-05-01 18:47:05.069722 | 34 |
| 2014-05-01 18:47:05.119994 | 25 |
| 2014-05-02 18:47:05.178768 | 26 |
| 2014-05-02 18:47:05.230071 | 15 |
| 2014-05-02 18:47:05.230071 | 15 |
| 2014-05-02 18:47:05.280592 | 14 |
| 2014-05-03 18:47:05.332662 | 26 |
| 2014-05-03 18:47:05.385109 | 25 |
| 2014-05-04 18:47:05.436523 | 62 |
| 2014-05-04 18:47:05.486877 | 41 |
# 查看 2014 年的所有观测
df['2014']
| | battle_deaths |
| --- | --- |
| date | |
| 2014-05-01 18:47:05.069722 | 34 |
| 2014-05-01 18:47:05.119994 | 25 |
| 2014-05-02 18:47:05.178768 | 26 |
| 2014-05-02 18:47:05.230071 | 15 |
| 2014-05-02 18:47:05.230071 | 15 |
| 2014-05-02 18:47:05.280592 | 14 |
| 2014-05-03 18:47:05.332662 | 26 |
| 2014-05-03 18:47:05.385109 | 25 |
| 2014-05-04 18:47:05.436523 | 62 |
| 2014-05-04 18:47:05.486877 | 41 |
# 查看 2014 年 5 月的所有观测
df['2014-05']
| | battle_deaths |
| --- | --- |
| date | |
| 2014-05-01 18:47:05.069722 | 34 |
| 2014-05-01 18:47:05.119994 | 25 |
| 2014-05-02 18:47:05.178768 | 26 |
| 2014-05-02 18:47:05.230071 | 15 |
| 2014-05-02 18:47:05.230071 | 15 |
| 2014-05-02 18:47:05.280592 | 14 |
| 2014-05-03 18:47:05.332662 | 26 |
| 2014-05-03 18:47:05.385109 | 25 |
| 2014-05-04 18:47:05.436523 | 62 |
| 2014-05-04 18:47:05.486877 | 41 |
# 查看 2014.5.3 及之后的所有观测
df[datetime(2014, 5, 3):]
| | battle_deaths |
| --- | --- |
| date | |
| 2014-05-03 18:47:05.332662 | 26 |
| 2014-05-03 18:47:05.385109 | 25 |
| 2014-05-04 18:47:05.436523 | 62 |
| 2014-05-04 18:47:05.486877 | 41 |
# 查看 2014.5.3 至 2014.5.4 之间的所有观测
df['5/3/2014':'5/4/2014']
| | battle_deaths |
| --- | --- |
| date | |
| 2014-05-03 18:47:05.332662 | 26 |
| 2014-05-03 18:47:05.385109 | 25 |
| 2014-05-04 18:47:05.436523 | 62 |
| 2014-05-04 18:47:05.486877 | 41 |
# 截断 2014.5.2 之后的观测
df.truncate(after='5/3/2014')
| | battle_deaths |
| --- | --- |
| date | |
| 2014-05-01 18:47:05.069722 | 34 |
| 2014-05-01 18:47:05.119994 | 25 |
| 2014-05-02 18:47:05.178768 | 26 |
| 2014-05-02 18:47:05.230071 | 15 |
| 2014-05-02 18:47:05.230071 | 15 |
| 2014-05-02 18:47:05.280592 | 14 |
# 2014.5 的观测
df['5-2014']
| | battle_deaths |
| --- | --- |
| date | |
| 2014-05-01 18:47:05.069722 | 34 |
| 2014-05-01 18:47:05.119994 | 25 |
| 2014-05-02 18:47:05.178768 | 26 |
| 2014-05-02 18:47:05.230071 | 15 |
| 2014-05-02 18:47:05.230071 | 15 |
| 2014-05-02 18:47:05.280592 | 14 |
| 2014-05-03 18:47:05.332662 | 26 |
| 2014-05-03 18:47:05.385109 | 25 |
| 2014-05-04 18:47:05.436523 | 62 |
| 2014-05-04 18:47:05.486877 | 41 |
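补充说明(非原文内容,基于较新 pandas 版本的假设):按日期字符串选择行时,新版本更推荐通过 loc 来做,仅作示意:
# 2014 年 5 月的所有观测
df.loc['2014-05']
# 5 月 3 日到 5 月 4 日之间的观测
df.loc['2014-05-03':'2014-05-04']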
# 计算每个时间戳的观测数
df.groupby(level=0).count()
| | battle_deaths |
| --- | --- |
| date | |
| 2014-05-01 18:47:05.069722 | 1 |
| 2014-05-01 18:47:05.119994 | 1 |
| 2014-05-02 18:47:05.178768 | 1 |
| 2014-05-02 18:47:05.230071 | 2 |
| 2014-05-02 18:47:05.280592 | 1 |
| 2014-05-03 18:47:05.332662 | 1 |
| 2014-05-03 18:47:05.385109 | 1 |
| 2014-05-04 18:47:05.436523 | 1 |
| 2014-05-04 18:47:05.486877 | 1 |
# 每天的 battle_deaths 均值
df.resample('D').mean()
| | battle_deaths |
| --- | --- |
| date | |
| 2014-05-01 | 29.5 |
| 2014-05-02 | 17.5 |
| 2014-05-03 | 25.5 |
| 2014-05-04 | 51.5 |
# 每天的 battle_deaths 总数
df.resample('D').sum()
| | battle_deaths |
| --- | --- |
| date | |
| 2014-05-01 | 59 |
| 2014-05-02 | 70 |
| 2014-05-03 | 51 |
| 2014-05-04 | 103 |
# 绘制每天的总死亡人数
df.resample('D').sum().plot()
# <matplotlib.axes._subplots.AxesSubplot at 0x11187a940>
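补充示例(非原文内容):resample 也可以使用其它频率规则,例如每两天汇总一次,仅作示意:
# 每 2 天的 battle_deaths 总数
df.resample('2D').sum()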