Import libraries
from pyspark import SparkConf, SparkContext
sc = SparkContext.getOrCreate()
Create an RDD
data = sc.parallelize([('Amber', 22),
                       ('Alfred', 23),
                       ('Skye', 4),
                       ('Albert', 12),
                       ('Amber', 9)])
data
--->ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175
.collect(): an action that sends the entire dataset back to the driver
data_heterogenous = sc.parallelize([('Ferrari', 'fast'),
                                    {'Porsche': 100000},
                                    ['Spain', 'visited', 4504]
                                   ]).collect()  # .collect() sends this dataset back to the driver
data_heterogenous
--->[('Ferrari', 'fast'), {'Porsche': 100000}, ['Spain', 'visited', 4504]]
Index one of the values
data_heterogenous[1]['Porsche']
--->100000
Read a compressed file
data_from_file = sc.textFile(r'D:\小鸡理财\OneDrive\python\book\PySpark实战指南\VS14MORT.txt.gz', 4)  # the 4 splits the data into 4 partitions
data_from_file
--->D:\小鸡理财\OneDrive\python\book\PySpark实战指南\VS14MORT.txt.gz MapPartitionsRDD[3] at textFile at NativeMethodAccessorImpl.java:0
.take(): return the first n elements of the dataset
data_from_file.take(1)
--->
[' 1 2101 M1087 432311 4M4 2014U7CN I64 238 070 24 0111I64 01 I64 01 11 100 601']
Example parsing function
def extractInformation(row):
    import re
    import numpy as np

    selected_indices = [
        2,4,5,6,7,9,10,11,12,13,14,15,16,17,18,
        19,21,22,23,24,25,27,28,29,30,32,33,34,
        36,37,38,39,40,41,42,43,44,45,46,47,48,
        49,50,51,52,53,54,55,56,58,60,61,62,63,
        64,65,66,67,68,69,70,71,72,73,74,75,76,
        77,78,79,81,82,83,84,85,87,89
    ]

    record_split = re.compile(
        r'([\s]{19})([0-9]{1})([\s]{40})([0-9\s]{2})([0-9\s]{1})([0-9]{1})([0-9]{2})' +
        r'([\s]{2})([FM]{1})([0-9]{1})([0-9]{3})([0-9\s]{1})([0-9]{2})([0-9]{2})' +
        r'([0-9]{2})([0-9\s]{2})([0-9]{1})([SMWDU]{1})([0-9]{1})([\s]{16})([0-9]{4})' +
        r'([YNU]{1})([0-9\s]{1})([BCOU]{1})([YNU]{1})([\s]{34})([0-9\s]{1})([0-9\s]{1})' +
        r'([A-Z0-9\s]{4})([0-9]{3})([\s]{1})([0-9\s]{3})([0-9\s]{3})([0-9\s]{2})([\s]{1})' +
        r'([0-9\s]{2})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})' +
        r'([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})' +
        r'([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})' +
        r'([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})' +
        r'([A-Z0-9\s]{7})([\s]{36})([A-Z0-9\s]{2})([\s]{1})([A-Z0-9\s]{5})([A-Z0-9\s]{5})' +
        r'([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})' +
        r'([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})' +
        r'([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})' +
        r'([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})([\s]{1})([0-9\s]{2})([0-9\s]{1})' +
        r'([0-9\s]{1})([0-9\s]{1})([0-9\s]{1})([\s]{33})([0-9\s]{3})([0-9\s]{1})([0-9\s]{1})')

    try:
        rs = np.array(record_split.split(row))[selected_indices]
    except:
        rs = np.array(['-99'] * len(selected_indices))
    return rs
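Why is the result of record_split.split(row) indexed with selected_indices? When a pattern made entirely of capture groups matches, re.split() returns the groups interleaved with the text around the match (empty strings here), so the fields of interest sit at the group positions. A minimal sketch with a made-up three-field record (the pattern and indices below are illustrative only):
import re
import numpy as np
toy_split = re.compile(r'([A-Z]{2})([0-9]{4})([FM]{1})')   # hypothetical 2-letter / 4-digit / F-or-M layout
parts = toy_split.split('AB2014M')
print(parts)                    # ['', 'AB', '2014', 'M', ''] -- groups land at positions 1..3
print(np.array(parts)[[2, 3]])  # keep a subset of the groups -> ['2014' 'M']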
.map() transformation: apply a function to every element
data_from_file_conv = data_from_file.map(extractInformation)
# data_from_file_conv.map(lambda row: row).take(1)
data_2014 = data_from_file_conv.map(lambda x: int(x[16]))
data_2014.take(15)
--->
[2014,
2014,
2014,
2014,
2014,
2014,
2014,
2014,
2014,
-99,
2014,
2014,
-99,
2014,
2014]
data_2014_2 = data_from_file_conv.map(lambda row: (row[16], int(row[16])))
data_2014_2.take(10)
--->
[('2014', 2014),
('2014', 2014),
('2014', 2014),
('2014', 2014),
('2014', 2014),
('2014', 2014),
('2014', 2014),
('2014', 2014),
('2014', 2014),
('-99', -99)]
.filter() transformation: keep only the elements that satisfy a condition
data_filtered = data_from_file_conv.filter(lambda row: row[16] == '2014' and row[21] == '0')
data_filtered.take(1)
--->
[array(['2', '12', ' ', '0', '07', 'F', '1', '030', ' ', '32', '12', '05',
' ', '1', 'D', '6', '2014', 'N', '1', 'U', 'Y', '0', '9', 'X44 ',
'420', '122', ' ', '39', '05', '11T391 ', '12X44 ', '13T401 ',
'14T424 ', '61F199 ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', '05',
'X44 ', 'F199 ', 'T391 ', 'T401 ', 'T424 ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', '01', ' ',
' ', '1', '1', '100', '6'],
dtype='<U40')]
.count(): count the number of elements
data_filtered.count()
--->22
.flatMap(): works like .map() but returns a flattened result
data_2014_flat = data_from_file_conv.flatMap(lambda row: (row[16], int(row[16]) + 1))
data_2014_flat.take(10)
--->['2014', 2015, '2014', 2015, '2014', 2015, '2014', 2015, '2014', 2015]
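To make the flattening concrete, a small sketch on a throwaway RDD (the variable nums is made up here):
nums = sc.parallelize([1, 2, 3])
nums.map(lambda x: (x, x * 10)).collect()      # [(1, 10), (2, 20), (3, 30)] -- one tuple per element
nums.flatMap(lambda x: (x, x * 10)).collect()  # [1, 10, 2, 20, 3, 30] -- the tuples are unpacked into a flat list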
.distinct(): return the distinct values
distinct_gender = data_from_file_conv.map(lambda row: row[5]).distinct()
distinct_gender.collect()
--->['M', 'F', '-99']
.sample(): return a random sample of the dataset
fraction = 0.1
data_sample = data_from_file_conv.sample(False, fraction, 666)
# the first argument specifies whether the sampling is with replacement, the second the fraction of the data to sample, the third the seed for the pseudo-random number generator
data_sample.take(1)
--->
[array(['1', ' ', '5', '1', '01', 'F', '1', '082', ' ', '42', '22', '10',
' ', '4', 'W', '5', '2014', 'U', '7', 'C', 'N', ' ', ' ', 'I251',
'215', '063', ' ', '21', '02', '11I350 ', '21I251 ', ' ',
' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', '02',
'I251 ', 'I350 ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', '28', ' ',
' ', '2', '4', '100', '8'],
dtype='<U40')]
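A quick sanity check (a sketch; the exact counts depend on the file and the seed, so no specific numbers are claimed here): the sampled fraction should land close to 0.1.
print(data_sample.count() / float(data_from_file_conv.count()))  # roughly 0.1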
.leftOuterJoin(): left outer join
rdd1 = sc.parallelize([('a', 1), ('b', 4), ('c',10)])
rdd2 = sc.parallelize([('a', 4), ('a', 1), ('b', '6'), ('d', 15)])
rdd3 = rdd1.leftOuterJoin(rdd2)
rdd3.collect()
--->[('b', (4, '6')), ('c', (10, None)), ('a', (1, 4)), ('a', (1, 1))]
.join(): inner join
rdd4 = rdd1.join(rdd2)
rdd4.collect()
--->[('b', (4, '6')), ('a', (1, 4)), ('a', (1, 1))]
---------------------------------------------------------------------------------------------------------
Finding and removing duplicates
Import libraries
from pyspark.sql import SQLContext
spark = SQLContext(sc)
Create a DataFrame
df = spark.createDataFrame([
    (1, 144.5, 5.9, 33, 'M'),
    (2, 167.2, 5.4, 45, 'M'),
    (3, 124.1, 5.2, 23, 'F'),
    (4, 144.5, 5.9, 33, 'M'),
    (5, 133.2, 5.7, 54, 'F'),
    (3, 124.1, 5.2, 23, 'F'),
    (5, 129.2, 5.3, 42, 'M'),
], ['id', 'weight', 'height', 'age', 'gender'])
.show(): print the DataFrame
df.show()
--->
+---+------+------+---+------+
| id|weight|height|age|gender|
+---+------+------+---+------+
| 1| 144.5| 5.9| 33| M|
| 2| 167.2| 5.4| 45| M|
| 3| 124.1| 5.2| 23| F|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 3| 124.1| 5.2| 23| F|
| 5| 129.2| 5.3| 42| M|
+---+------+------+---+------+
df.count()
--->7
df.distinct().count() # two rows are exact duplicates across every column
--->6
.drop_duplicates(): remove duplicate rows
df = df.drop_duplicates()
df.show()
--->
+---+------+------+---+------+
| id|weight|height|age|gender|
+---+------+------+---+------+
| 5| 133.2| 5.7| 54| F|
| 5| 129.2| 5.3| 42| M|
| 1| 144.5| 5.9| 33| M|
| 4| 144.5| 5.9| 33| M|
| 2| 167.2| 5.4| 45| M|
| 3| 124.1| 5.2| 23| F|
+---+------+------+---+------+
df.distinct().count()
--->6
.select(): pick out specific columns
df.select([
    c for c in df.columns if c != 'id'
]).distinct().count()
--->5
df.select(['id']).distinct().count()
--->5
# look for duplicates only in the listed columns
df = df.drop_duplicates(subset=[
    c for c in df.columns if c != 'id'
])
df.show()
--->
+---+------+------+---+------+
| id|weight|height|age|gender|
+---+------+------+---+------+
| 5| 133.2| 5.7| 54| F|
| 1| 144.5| 5.9| 33| M|
| 2| 167.2| 5.4| 45| M|
| 3| 124.1| 5.2| 23| F|
| 5| 129.2| 5.3| 42| M|
+---+------+------+---+------+
.agg(): compute aggregates over the whole DataFrame
import pyspark.sql.functions as fn
df.agg(fn.count('id').alias('count'),
       fn.countDistinct('id').alias('distinct')).show()
--->
+-----+--------+
|count|distinct|
+-----+--------+
| 5| 4|
+-----+--------+
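The same pattern scales to every column at once; a minimal sketch reusing df and fn from above:
df.agg(*[fn.countDistinct(c).alias(c + '_distinct') for c in df.columns]).show()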
.monotonically_increasing_id(): give each record a unique, monotonically increasing ID
df.withColumn('new_id', fn.monotonically_increasing_id()).show()
--->
+---+------+------+---+------+-------------+
| id|weight|height|age|gender| new_id|
+---+------+------+---+------+-------------+
| 5| 133.2| 5.7| 54| F| 25769803776|
| 1| 144.5| 5.9| 33| M| 171798691840|
| 2| 167.2| 5.4| 45| M| 592705486848|
| 3| 124.1| 5.2| 23| F|1236950581248|
| 5| 129.2| 5.3| 42| M|1365799600128|
+---+------+------+---+------+-------------+
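The IDs are unique and increasing but not consecutive: the upper 31 bits carry the partition index and the lower 33 bits the record number within that partition (per the Spark documentation). A quick sketch decoding the values printed above:
for new_id in [25769803776, 171798691840, 592705486848]:
    print(new_id >> 33, new_id & ((1 << 33) - 1))  # prints 3 0, 20 0, 69 0: first record of partitions 3, 20 and 69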
---------------------------------------------------------------------------------------------------------
Handling missing values
Import libraries
from pyspark import SparkConf, SparkContext
sc = SparkContext.getOrCreate()
from pyspark.sql import SQLContext
spark = SQLContext(sc)
Example data
df_miss = spark.createDataFrame([
    (1, 143.5, 5.6, 28, 'M', 100000),
    (2, 167.2, 5.4, 45, 'M', None),
    (3, None, 5.2, None, None, None),
    (4, 144.5, 5.9, 33, 'M', None),
    (5, 133.2, 5.7, 54, 'F', None),
    (6, 124.1, 5.2, None, 'F', None),
    (7, 129.2, 5.3, 42, 'M', 76000),
], ['id', 'weight', 'height', 'age', 'gender', 'income'])
df_miss.show()
--->
+---+------+------+----+------+------+
| id|weight|height| age|gender|income|
+---+------+------+----+------+------+
| 1| 143.5| 5.6| 28| M|100000|
| 2| 167.2| 5.4| 45| M| null|
| 3| null| 5.2|null| null| null|
| 4| 144.5| 5.9| 33| M| null|
| 5| 133.2| 5.7| 54| F| null|
| 6| 124.1| 5.2|null| F| null|
| 7| 129.2| 5.3| 42| M| 76000|
+---+------+------+----+------+------+
Count the number of missing values in each row
df_miss.rdd.map(lambda row: (row['id'], sum([c is None for c in row]))).collect()
--->[(1, 0), (2, 1), (3, 4), (4, 1), (5, 1), (6, 2), (7, 0)]
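The same per-row count can be done without dropping to the RDD API; a sketch staying in DataFrame operations (the alias n_missing is made up here):
import pyspark.sql.functions as fn
df_miss.select(
    'id',
    sum(fn.when(fn.isnull(c), 1).otherwise(0) for c in df_miss.columns).alias('n_missing')
).show()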
Inspect the row with id 3 from the result above
df_miss.where('id == 3').show()
--->
+---+------+------+----+------+------+
| id|weight|height| age|gender|income|
+---+------+------+----+------+------+
| 3| null| 5.2|null| null| null|
+---+------+------+----+------+------+
Compute the fraction of missing values in each column
import pyspark.sql.functions as fn
df_miss.agg(*[
    (1 - (fn.count(c) / fn.count('*'))).alias(c + '_missing')
    for c in df_miss.columns
]).show()
--->
+----------+------------------+--------------+------------------+------------------+------------------+
|id_missing| weight_missing|height_missing| age_missing| gender_missing| income_missing|
+----------+------------------+--------------+------------------+------------------+------------------+
| 0.0|0.1428571428571429| 0.0|0.2857142857142857|0.1428571428571429|0.7142857142857143|
+----------+------------------+--------------+------------------+------------------+------------------+
Drop the 'income' column
df_miss_no_income = df_miss.select([c for c in df_miss.columns if c != 'income'])
df_miss_no_income.show()
--->
+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
| 1| 143.5| 5.6| 28| M|
| 2| 167.2| 5.4| 45| M|
| 3| null| 5.2|null| null|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 6| 124.1| 5.2|null| F|
| 7| 129.2| 5.3| 42| M|
+---+------+------+----+------+
Drop rows that do not have at least 3 non-missing values (thresh=3 is the minimum number of observed fields a row must have to be kept)
df_miss_no_income.dropna(thresh=3).show()
--->
+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
| 1| 143.5| 5.6| 28| M|
| 2| 167.2| 5.4| 45| M|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 6| 124.1| 5.2|null| F|
| 7| 129.2| 5.3| 42| M|
+---+------+------+----+------+
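In dropna(), thresh is the minimum number of non-null fields a row must have to survive, so with 5 columns thresh=3 drops any row carrying 3 or more nulls. A throwaway sketch on a made-up frame (tiny and its columns are hypothetical):
tiny = spark.createDataFrame([(1, None, None), (2, 3, None), (4, 5, 6)], ['a', 'b', 'c'])
tiny.dropna(thresh=2).show()  # keeps only the rows with at least 2 non-null values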
Fill missing values with the column means (string columns excluded)
# compute the mean of every numeric column and convert the result to a dict
means = df_miss_no_income.agg(
    *[fn.mean(c).alias(c) for c in df_miss_no_income.columns if c != 'gender']
).toPandas().to_dict('records')[0]
# add a replacement value for the string column to the dict
means['gender'] = 'missing'
# print the dict
print('means:', means)
--->means: {'id': 4.0, 'weight': 140.28333333333333, 'height': 5.4714285714285706, 'age': 40.399999999999999, 'gender': 'missing'}
# fill the missing values
df_miss_no_income.fillna(means).show()
--->
+---+------------------+------+---+-------+
| id| weight|height|age| gender|
+---+------------------+------+---+-------+
| 1| 143.5| 5.6| 28| M|
| 2| 167.2| 5.4| 45| M|
| 3|140.28333333333333| 5.2| 40|missing|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 6| 124.1| 5.2| 40| F|
| 7| 129.2| 5.3| 42| M|
+---+------------------+------+---+-------+
---------------------------------------------------------------------------------------------------------
Handling outliers
Example data
df_outliers = spark.createDataFrame([
    (1, 143.5, 5.3, 28),
    (2, 154.2, 5.5, 45),
    (3, 342.3, 5.1, 99),
    (4, 144.5, 5.5, 33),
    (5, 133.2, 5.4, 54),
    (6, 124.1, 5.1, 21),
    (7, 129.2, 5.3, 42),
], ['id', 'weight', 'height', 'age'])
df_outliers.show()
--->
+---+------+------+---+
| id|weight|height|age|
+---+------+------+---+
| 1| 143.5| 5.3| 28|
| 2| 154.2| 5.5| 45|
| 3| 342.3| 5.1| 99|
| 4| 144.5| 5.5| 33|
| 5| 133.2| 5.4| 54|
| 6| 124.1| 5.1| 21|
| 7| 129.2| 5.3| 42|
+---+------+------+---+
Compute the outlier cutoffs for each numeric column using the IQR rule (1.5 * IQR beyond the approximate quartiles)
cols = ['weight', 'height', 'age']
bounds = {}
for col in cols:
    quantiles = df_outliers.approxQuantile(col, [0.25, 0.75], 0.05)
    IQR = quantiles[1] - quantiles[0]
    bounds[col] = [quantiles[0] - 1.5 * IQR, quantiles[1] + 1.5 * IQR]

print('bounds:', bounds)
--->bounds: {'weight': [91.69999999999999, 191.7], 'height': [4.499999999999999, 6.1000000000000005], 'age': [-11.0, 93.0]}
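To see where those numbers come from, working 'weight' by hand (the quartile values below are read back from the printed bounds rather than re-queried):
q1, q3 = 129.2, 154.2                    # approximate 25th / 75th percentiles of 'weight'
iqr = q3 - q1                            # 25.0
print(q1 - 1.5 * iqr, q3 + 1.5 * iqr)    # ~91.7 191.7, matching bounds['weight']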
Flag which values fall outside their bounds
outliers = df_outliers.select(*['id'] + [
    (
        (df_outliers[c] < bounds[c][0]) |
        (df_outliers[c] > bounds[c][1])
    ).alias(c + '_o') for c in cols
])
outliers.show()
--->
+---+--------+--------+-----+
| id|weight_o|height_o|age_o|
+---+--------+--------+-----+
| 1| false| false|false|
| 2| false| false|false|
| 3| true| false| true|
| 4| false| false|false|
| 5| false| false|false|
| 6| false| false|false|
| 7| false| false|false|
+---+--------+--------+-----+
Extracting the outliers
# join the outlier flags back onto the original data
df_outliers = df_outliers.join(outliers, on='id')
# pull out the outliers in the 'weight' column
df_outliers.filter('weight_o').select('id', 'weight').show()
--->
# pull out the outliers in the 'age' column
# df_outliers.filter('age_o').select('id', 'age').show()
---------------------------------------------------------------------------------------------------------
Descriptive statistics
import pyspark.sql.types as typ
Load the data from a CSV file
fraud = sc.textFile(r'D:\小鸡理财\OneDrive\python\Jupyter\spark\ccFraud.csv')
header = fraud.first()
fraud = fraud.filter(lambda row: row != header).map(lambda row: [int(elem) for elem in row.split(',')])
fraud.take(1)
--->[[1, 1, 35, 1, 3000, 4, 14, 2, 0]]
Create a DataFrame (build the schema from the CSV header first)
fields = [
    typ.StructField(h[1:-1], typ.IntegerType(), True)
    for h in header.split(',')
]
print('fields:', fields)
--->fields: [StructField(custID,IntegerType,true), StructField(gender,IntegerType,true), StructField(state,IntegerType,true), StructField(cardholder,IntegerType,true), StructField(balance,IntegerType,true), StructField(numTrans,IntegerType,true), StructField(numIntlTrans,IntegerType,true), StructField(creditLine,IntegerType,true), StructField(fraudRisk,IntegerType,true)]
schema = typ.StructType(fields)
print('schema:',schema)
--->schema: StructType(List(StructField(custID,IntegerType,true),StructField(gender,IntegerType,true),StructField(state,IntegerType,true),StructField(cardholder,IntegerType,true),StructField(balance,IntegerType,true),StructField(numTrans,IntegerType,true),StructField(numIntlTrans,IntegerType,true),StructField(creditLine,IntegerType,true),StructField(fraudRisk,IntegerType,true)))
fraud_df = spark.createDataFrame(fraud, schema)
fraud_df.printSchema()
--->
root
|-- custID: integer (nullable = true)
|-- gender: integer (nullable = true)
|-- state: integer (nullable = true)
|-- cardholder: integer (nullable = true)
|-- balance: integer (nullable = true)
|-- numTrans: integer (nullable = true)
|-- numIntlTrans: integer (nullable = true)
|-- creditLine: integer (nullable = true)
|-- fraudRisk: integer (nullable = true)
fraud_df.show()
--->
+------+------+-----+----------+-------+--------+------------+----------+---------+
|custID|gender|state|cardholder|balance|numTrans|numIntlTrans|creditLine|fraudRisk|
+------+------+-----+----------+-------+--------+------------+----------+---------+
| 1| 1| 35| 1| 3000| 4| 14| 2| 0|
| 2| 2| 2| 1| 0| 9| 0| 18| 0|
| 3| 2| 2| 1| 0| 27| 9| 16| 0|
| 4| 1| 15| 1| 0| 12| 0| 5| 0|
| 5| 1| 46| 1| 0| 11| 16| 7| 0|
| 6| 2| 44| 2| 5546| 21| 0| 13| 0|
| 7| 1| 3| 1| 2000| 41| 0| 1| 0|
| 8| 1| 10| 1| 6016| 20| 3| 6| 0|
| 9| 2| 32| 1| 2428| 4| 10| 22| 0|
| 10| 1| 23| 1| 0| 18| 56| 5| 0|
| 11| 1| 46| 1| 4601| 54| 0| 4| 0|
| 12| 1| 10| 1| 3000| 20| 0| 2| 0|
| 13| 1| 6| 1| 0| 45| 2| 4| 0|
| 14| 2| 38| 1| 9000| 41| 3| 8| 0|
| 15| 1| 27| 1| 5227| 60| 0| 17| 0|
| 16| 1| 44| 1| 0| 22| 0| 5| 0|
| 17| 2| 18| 1| 13970| 20| 0| 13| 0|
| 18| 1| 35| 1| 3113| 13| 6| 8| 0|
| 19| 1| 5| 1| 9000| 20| 2| 8| 0|
| 20| 2| 31| 1| 1860| 21| 10| 8| 0|
+------+------+-----+----------+-------+--------+------------+----------+---------+
only showing top 20 rows
Count the records per gender
fraud_df.groupby('gender').count().show()
--->
+------+-------+
|gender| count|
+------+-------+
| 1|6178231|
| 2|3821769|
+------+-------+
Descriptive statistics for the numeric columns
numerical = ['balance', 'numTrans', 'numIntlTrans']
desc = fraud_df.describe(numerical)
desc.show()
--->
+-------+-----------------+------------------+-----------------+
|summary| balance| numTrans| numIntlTrans|
+-------+-----------------+------------------+-----------------+
| count| 10000000| 10000000| 10000000|
| mean| 4109.9199193| 28.9351871| 4.0471899|
| stddev|3996.847309737258|26.553781024523122|8.602970115863904|
| min| 0| 0| 0|
| max| 41485| 100| 60|
+-------+-----------------+------------------+-----------------+
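.describe() covers count/mean/stddev/min/max; other aggregates can be pulled from the functions module directly. A sketch (fn.skewness and fn.kurtosis are standard pyspark.sql.functions; the column choice is just an example):
import pyspark.sql.functions as fn
fraud_df.agg(fn.skewness('balance').alias('balance_skew'),
             fn.kurtosis('balance').alias('balance_kurt')).show()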
---------------------------------------------------------------------------------------------------------
Visualization
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
fraud_df.select('balance').show()
--->
+-------+
|balance|
+-------+
| 3000|
| 0|
| 0|
| 0|
| 0|
| 5546|
| 2000|
| 6016|
| 2428|
| 0|
| 4601|
| 3000|
| 0|
| 9000|
| 5227|
| 0|
| 13970|
| 3113|
| 9000|
| 1860|
+-------+
only showing top 20 rows
Plot a histogram
hists = fraud_df.select('balance').rdd.flatMap(lambda row: row).histogram(20)
data = {
    'bins': hists[0][:-1],
    'freq': hists[1]
}
fig = plt.figure(figsize=(12,9))
ax = fig.add_subplot(1, 1, 1)
ax.bar(data['bins'], data['freq'], width=2000)
ax.set_title('Histogram of \'balance\'')
plt.savefig('B05793_05_22.png', dpi=300)
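For a quick look on a small slice of the data, an alternative sketch is to sample, pull the column to the driver and let pandas/matplotlib bin it locally (toPandas() materializes everything in driver memory, so keep the sampled fraction small):
pdf = fraud_df.select('balance').sample(False, 0.02, 42).toPandas()
pdf['balance'].plot(kind='hist', bins=20, figsize=(12, 9))
plt.title('Histogram of \'balance\' (2% sample)')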