spark3.0-SparkSQL 读取 HDFS 文件 作者:马育民 • 2021-12-11 14:24 • 阅读:10208

# 说明

spark sql 读取 HDFS 文件

# 准备数据

```
1,100,2020-01-01 09:30:50,2020-01-01 09:40:50
2,200,2020-01-11 10:40:50,2020-01-11 11:05:50
3,150,2020-01-01 11:20:50,2020-01-01 11:32:50
1,300,2020-01-03 14:30:50,2020-01-03 15:20:10
1,50,2020-02-02 12:30:50,2020-02-02 12:35:50
2,500,2020-02-02 15:30:30,2020-02-02 17:20:30
3,150,2020-02-01 11:20:50,2020-02-01 11:32:50
3,150,2020-02-03 11:20:50,2020-02-03 11:32:50
1,100,2020-03-01 09:30:50,2020-03-01 09:40:50
1,100,2020-03-05 09:30:50,2020-03-05 09:40:50
2,500,2020-03-12 15:30:30,2020-03-12 17:30:30
2,500,2020-03-22 15:30:30,2020-03-22 17:30:30
3,400,2020-03-02 17:30:20,2020-03-02 18:20:20
4,100,2020-04-01 11:20:50,2020-04-01 11:25:50
4,50,2020-04-21 11:20:50,2020-04-21 11:28:50
5,150,2020-04-11 11:20:50,2020-04-11 11:32:50
6,150,2020-04-15 11:20:50,2020-04-15 11:40:50
```

# scala

```
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * Demo driver: read a headerless CSV file of charging-pile records from HDFS
 * with Spark SQL, register it as a temp view, and query rows where ele_num > 200.
 */
object Std2_HDFS {

  def main(args: Array[String]): Unit = {
    val sc = new SparkConf().setMaster("local[*]").setAppName("std")

    // Create the SparkSession from the SparkConf above
    val ss: SparkSession = SparkSession.builder().config(sc).getOrCreate()

    val df: DataFrame = ss.read
      .option("header", "false") // default is false: the first line is data, not a header row
      .option("sep", ",") // column separator
      .csv("hdfs://hadoop1:8020/chongdianzhuang.txt") // read the file from HDFS
      .withColumnRenamed("_c0", "id") // a headerless CSV yields _c0.._cN; give the columns meaningful names
      .withColumnRenamed("_c1", "ele_num")
      .withColumnRenamed("_c2", "start_time")
      .withColumnRenamed("_c3", "end_time")

    // show all rows
    // df.show()

    // Create a session-scoped temporary view (visible only in the current session;
    // a view can be queried but not modified)
    df.createOrReplaceTempView("cdz")

    // NOTE(review): no inferSchema option is set, so ele_num is presumably a string
    // column and the > 200 comparison relies on Spark's implicit cast — confirm.
    val resDF: DataFrame = ss.sql("select id,ele_num,start_time,end_time from cdz where ele_num>200")
    resDF.show()

    // (optional) total charge per charging pile, saved as a single text file:
    // println("统计各充电桩的充电量")
    // val frame: DataFrame = ss.sql("select id,sum(ele_num) as `充电量` from cdz group by id")
    // frame.rdd.coalesce(1).saveAsTextFile("res2")

    ss.stop() // shut down the SparkSession
  }
}
```

原文出处:http://malaoshi.top/show_1IX2O4qqtTO6.html