导入库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
import matplotlib
plt.rcParams["font.sans-serif"]=["SimHei"] # set the sans-serif font to SimHei
plt.rcParams["axes.unicode_minus"] = False # display the minus sign correctly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot
plt.style.use("ggplot")
# seaborn's style installs its own font.sans-serif list, which would discard the
# SimHei font configured earlier; re-assert it through `rc` (applied last by
# sns.set) so the chosen font is still in effect afterwards.
sns.set(
    context="notebook",
    style="darkgrid",
    palette="colorblind",
    font="sans-serif",
    font_scale=1,
    rc={"font.sans-serif": ["SimHei"]},
)
# Global matplotlib defaults: figure size, base font size, font family.
matplotlib.rcParams["figure.figsize"] = [8, 8]
matplotlib.rcParams.update({"font.size": 15})
matplotlib.rcParams["font.family"] = "sans-serif"
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
# Suppress warnings in the notebook
import warnings
warnings.filterwarnings("ignore")
数据基本信息
# Load the training data and preview its first five rows.
df1 = pd.read_csv(filepath_or_buffer="train.csv")
df1.head(5)
image.png
# Dimensions of the loaded frame as a (rows, columns) tuple.
df1.shape
image.png
# Keep the column index in a variable and show it.
columns = df1.columns
columns
image.png
# Data type of every column.
df1.dtypes
image.png
# Share of each column dtype in the dataset.
dtype_counts = df1.dtypes.value_counts()
# One explode offset per slice: a fixed three-element list raises as soon as
# the frame has a different number of distinct dtypes.
dtype_counts.plot.pie(
    explode=[0.1] * len(dtype_counts),
    autopct="%1.2f%%",
    shadow=True,
)
plt.title("type of our data")
plt.show()
image.png
# Basic information about each column
df1.info()
image.png
# Number of missing values per column.
df1.isna().sum()
image.png
统计与可视化分析
# Gender analysis: frequency of each Gender value as a two-column frame.
df2 = (
    df1["Gender"]
    .value_counts()
    .reset_index()
)
df2
image.png
# Count of rows for each gender.
colors = ["red", "blue"]
# Pass the column as the `x` keyword: newer seaborn releases no longer accept
# it as a positional argument.
sns.countplot(x="Gender", data=df1, palette=colors)
plt.title("Gender Count")
plt.show()
image.png
# Share of rows for each gender.
size = df1["Gender"].value_counts()
# value_counts() orders by frequency, so take the labels from its index instead
# of a fixed ["Male", "Female"] list that can end up attached to the wrong slice.
labels = size.index
colors = ["#C4061D", "green"]
# Leave the first slice in place and pull every other slice out slightly;
# building the list from the data keeps its length in step with `size`.
explode = [0.1 if position else 0 for position in range(len(size))]
plt.rcParams["figure.figsize"] = (10, 10)
plt.pie(
    size,
    colors=colors,
    labels=labels,
    shadow=True,
    explode=explode,
    autopct="%.2f%%",
)
plt.title("Gender Percent", fontsize=20)
plt.axis("off")
plt.legend()
plt.show()
image.png
来源:尤而小屋