导入库

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
import matplotlib

plt.rcParams["font.sans-serif"]=["SimHei"]  # set the sans-serif font to SimHei
plt.rcParams["axes.unicode_minus"] = False  # display minus signs correctly

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot

plt.style.use("ggplot")
# seaborn's style presets overwrite "font.sans-serif", which discards any font
# chosen before this call. Entries in `rc` are applied after the preset, so the
# SimHei font and the minus-sign setting are passed here to make them stick.
sns.set(context="notebook",
        style="darkgrid",
        palette="colorblind",
        font="sans-serif",
        font_scale=1,
        rc={"font.sans-serif": ["SimHei"],
            "axes.unicode_minus": False})

# Global figure defaults applied on top of the seaborn theme.
matplotlib.rcParams["figure.figsize"] = [8, 8]
matplotlib.rcParams.update({"font.size":15})
matplotlib.rcParams["font.family"] = "sans-serif"

from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

# Suppress warnings in the notebook
import warnings
warnings.filterwarnings("ignore")

数据基本信息

# Load train.csv into a DataFrame and preview its first rows
df1 = pd.read_csv("train.csv")
df1.head()

image.png

# Shape of the DataFrame as (rows, columns)
df1.shape

image.png

# Column labels of the DataFrame
columns = df1.columns
columns

image.png

# Data type of each column
df1.dtypes

image.png

# Share of each column dtype
dtype_counts = df1.dtypes.value_counts()
# One explode offset per wedge: a fixed-length list makes matplotlib raise
# ValueError whenever the frame does not have exactly that many distinct dtypes.
dtype_counts.plot.pie(explode=[0.1] * len(dtype_counts), autopct="%1.2f%%", shadow=True)
plt.title("type of our data")

plt.show()

image.png

# Basic information about the columns
df1.info()

image.png

# Number of missing values per column
df1.isnull().sum()

image.png

统计与可视化分析

# Gender analysis: number of rows per Gender value, with the index reset into a column
df2 = df1["Gender"].value_counts().reset_index()
df2

image.png

# Number of records per gender
colors = ["red", "blue"]

# The column is passed as x=...: seaborn 0.12+ accepts only `data` positionally,
# so countplot("Gender", data=df1) raises TypeError there.
sns.countplot(x="Gender", data=df1, palette=colors)
plt.title("Gender Count")

plt.show()

image.png

# Share of records per gender
size = df1["Gender"].value_counts()

# value_counts() orders categories by frequency, so labels are derived from the
# index to guarantee each wedge is labelled with its own category.
# NOTE(review): assumes Gender is coded "M"/"F" — confirm against train.csv;
# any other value falls back to its raw text.
display_names = {"M": "Male", "F": "Female"}
labels = [display_names.get(g, str(g)) for g in size.index]
colors = ["#C4061D", "green"]
# One offset per wedge (first wedge flush, the rest pulled out) so the list
# always matches the number of categories, as plt.pie requires.
explode = [0 if i == 0 else 0.1 for i in range(len(size))]

plt.rcParams["figure.figsize"] = (10, 10)
plt.pie(size,
       colors=colors,
       labels=labels,
       shadow=True,
       explode=explode,
       autopct="%.2f%%")

plt.title("Gender Percent", fontsize=20)
plt.axis("off")
plt.legend()
plt.show()

image.png

来源：尤而小屋

12黑色星期五回归模型预测

12黑色星期五回归模型预测

导入库

数据基本信息

统计与可视化分析

推荐阅读更多精彩内容