Updated on 2024-08-10 GMT+08:00

Spark SQL Sample Projects (Scala)

Function

Collect statistics on female netizens who spend more than two hours online shopping during weekends.

Sample Code

The following code snippets are used as an example. For the complete code, see the com.huawei.bigdata.spark.examples.FemaleInfoCollection class.

object FemaleInfoCollection {
  // Schema case class: maps each CSV text record (name,gender,stayTime) to a typed row
  // so the RDD can be implicitly converted to a DataFrame.
  case class FemaleInfo(name: String, gender: String, stayTime: Int)

  /**
   * Entry point. args(0) is the path of the input text file, one CSV record per line.
   * Prints every female netizen whose total online time is at least 120 minutes.
   */
  def main(args: Array[String]): Unit = {
    // Configure the Spark application name.
    val spark = SparkSession
      .builder()
      .appName("FemaleInfo")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()
    import spark.implicits._

    // Convert the RDD to a DataFrame through implicit conversion, then register a
    // temporary view. createOrReplaceTempView replaces the registerTempTable method,
    // which has been deprecated since Spark 2.0.
    spark.sparkContext.textFile(args(0))
      .map(_.split(","))
      .map(p => FemaleInfo(p(0), p(1), p(2).trim.toInt))
      .toDF()
      .createOrReplaceTempView("FemaleInfoTable")

    // Use a SQL statement to keep only female netizens and aggregate the online
    // time of records with the same name. (The string must be a single literal —
    // a bare line break inside double quotes does not compile in Scala.)
    val femaleTimeInfo = spark.sql(
      "select name,sum(stayTime) as stayTime from FemaleInfoTable where gender = 'female' group by name")

    // Filter the female netizens who spend at least 2 hours (120 minutes) online
    // and print the result. foreach returns Unit, so there is nothing to bind.
    femaleTimeInfo.filter("stayTime >= 120").collect().foreach(println)

    spark.stop()
  }
}

For details about other Spark SQL features, visit http://spark.apache.org/docs/3.1.1/sql-programming-guide.html#running-sql-queries-programmatically.