Three ways to implement word count in Spark
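All three methods read the Hive table badou.wordcount, which is assumed to hold one line of text per row in a single sentence column. A hypothetical setup sketch (the schema is inferred from the queries below; the spark session comes from spark-shell or the builders used in methods 2 and 3):
// Hypothetical schema, inferred from the queries in this note
spark.sql("create table if not exists badou.wordcount (sentence string)")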
Method 1: pure Spark SQL. Split and explode each sentence into words in a subquery, then group by word and order by the count in descending order.
val df1 = spark.sql("select word,count(1) as word_cnt from (select explode(split(sentence, ' ')) as word from badou.wordcount) t group by word order by word_cnt desc")
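Method 1 does not create its own SparkSession; it assumes one with Hive support already exists, for example the spark variable in spark-shell or the same builder used in the next two methods. To inspect the result:
// The query above already sorts by word_cnt descending, so show() prints the most frequent words
df1.show()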
Method 2: DataFrame API with a UDF. A UDF splits each sentence into an array of words, explode turns the array into rows, and groupBy/count produces the sorted result.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
val spark = SparkSession.builder().enableHiveSupport().appName("Word count df").getOrCreate()
val df = spark.sql("select sentence from badou.wordcount")
// UDF that splits a sentence into an array of words
val split_udf = udf{ sentence: String => sentence.split(" ") }
// Explode the word array into one row per word, then count per word and sort descending
val wordcount = df
  .withColumn("split_words", split_udf(col("sentence")))
  .withColumn("word_explode", explode(col("split_words")))
  .groupBy(col("word_explode")).count()
  .withColumnRenamed("count", "word_count")
  .orderBy(desc("word_count"))
  .select("word_explode", "word_count")
wordcount.show()
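The split_udf above is interchangeable with the built-in split function from org.apache.spark.sql.functions; a sketch of the same pipeline without a custom UDF (same df and imports as above, the column name word is an arbitrary choice):
// Built-in split returns an array column directly, so no UDF is needed
val wordcount2 = df
  .withColumn("word", explode(split(col("sentence"), " ")))
  .groupBy("word").count()
  .withColumnRenamed("count", "word_count")
  .orderBy(desc("word_count"))
wordcount2.show()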
Method 3: RDD API. Convert the DataFrame to an RDD and use the classic flatMap / map / reduceByKey pattern, then sort by count.
import org.apache.spark.sql.{DataFrame, SparkSession}
val spark = SparkSession.builder().appName("Word Count")
.enableHiveSupport()
.getOrCreate()
val df = spark.sql("select sentence from badou.wordcount")
// Extract the sentence column as a plain String (Row.toString would wrap it in "[...]")
val wordcount_sort = df.select("sentence").rdd.map(_.getString(0))
  .flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
  .sortBy(_._2, ascending = false)
// collect() brings the sorted counts to the driver so the println output is visible locally
wordcount_sort.collect().foreach(println)
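collect() pulls every (word, count) pair back to the driver; on a large table it is usually enough to fetch only the top entries (the 20 below is an arbitrary choice):
// take(n) on the already-sorted RDD returns the n most frequent words
wordcount_sort.take(20).foreach(println)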