赞
踩
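本章所分析的数据来自Kaggle公开的、人工合成的音乐专辑发行数据(https://www.kaggle.com/datasets/revilrosa/music-label-dataset)。以下,我们只针对albums.csv文件进行分析。该数据具体包括以下字段(此处仅列出下文代码用到的部分):genre、num_of_sales、year_of_pub、num_of_tracks、rolling_stone_critic、mtv_critic、music_maniac_critic。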
以下,我们将csv文件从HDFS中读取进来,并转换为Spark DataFrame格式。
// Reuse the active SparkSession if there is one, otherwise create it under the
// application name "Albums".
val spark = SparkSession.builder().appName("Albums").getOrCreate()

// Needed for the $"column" syntax used by the queries in this listing.
import spark.implicits._
-
// Load albums.csv from HDFS as a DataFrame. The first line of the file is
// treated as the header row and column types are inferred from the data.
val df = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("hdfs:///SparkLearning/albums.csv")
// Number of albums per genre, most frequent genre first.
// Result columns: genre, count.
val res = df
  .select("genre")
  .groupBy("genre")
  .count()
  .sort($"count".desc)
// Total sales per genre, best-selling genre first.
// Result columns: genre, total_sales.
val res = df
  .select("genre", "num_of_sales")
  .groupBy("genre")
  .agg("num_of_sales" -> "sum")
  // The aggregate column is generated as "sum(num_of_sales)"; give it a usable name.
  .withColumnRenamed("sum(num_of_sales)", "total_sales")
  .sort($"total_sales".desc)
// Per publication year from 2000 onwards: how many albums were released and how
// many tracks they contain in total, in ascending year order.
// Result columns: year_of_pub, total_albums, total_tracks.
val res = {
  val since2000 = df
    .where($"year_of_pub" >= 2000)
    .select("year_of_pub", "num_of_tracks")

  since2000
    .groupBy("year_of_pub")
    // total_albums is a count of num_of_tracks values, so rows where that
    // column is null are not counted.
    .agg("num_of_tracks" -> "count", "num_of_tracks" -> "sum")
    .withColumnRenamed("count(num_of_tracks)", "total_albums")
    .withColumnRenamed("sum(num_of_tracks)", "total_tracks")
    .sort("year_of_pub")
}
// Yearly sales of the five genres with the highest overall sales,
// ordered by genre and then by publication year.
val res = {
  // t1: the five genres with the largest summed num_of_sales.
  val topGenres = df
    .select("genre", "num_of_sales")
    .groupBy($"genre")
    .agg("num_of_sales" -> "sum")
    .withColumnRenamed("sum(num_of_sales)", "total_sales")
    .sort($"total_sales".desc)
    .limit(5)
    .as("t1")

  // t2: per-album rows carrying the columns needed for the yearly breakdown.
  val albumSales = df
    .select("genre", "num_of_sales", "year_of_pub")
    .as("t2")

  // Joining on genre keeps only albums that belong to one of the top genres;
  // the aliases disambiguate the two "genre" columns.
  topGenres
    .join(albumSales, $"t1.genre" === $"t2.genre")
    .groupBy("t2.genre", "t2.year_of_pub")
    .sum("t2.num_of_sales")
    .orderBy("genre", "year_of_pub")
}
// Average of the three critic columns for each of the five genres with the
// highest overall sales, ordered by genre.
val res = {
  // t1: the five genres with the largest summed num_of_sales.
  val topGenres = df
    .select("genre", "num_of_sales")
    .groupBy($"genre")
    .agg("num_of_sales" -> "sum")
    .withColumnRenamed("sum(num_of_sales)", "total_sales")
    .sort($"total_sales".desc)
    .limit(5)
    .as("t1")

  // t2: per-album rows carrying the genre and the three critic columns.
  val critics = df
    .select("genre", "rolling_stone_critic", "mtv_critic", "music_maniac_critic")
    .as("t2")

  // Joining on genre keeps only albums that belong to one of the top genres;
  // the aliases disambiguate the two "genre" columns.
  topGenres
    .join(critics, $"t1.genre" === $"t2.genre")
    .groupBy("t2.genre")
    .agg(
      "rolling_stone_critic" -> "avg",
      "mtv_critic" -> "avg",
      "music_maniac_critic" -> "avg"
    )
    .orderBy("genre")
}

Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。