Updated on 2024-10-23 GMT+08:00

Spark SQL Sample Projects (Scala)

Function

Collects, from log files, information about female netizens who spent more than 2 hours shopping online over the weekend.

Sample Code

The following code segment is only an example. For details, see com.huawei.bigdata.spark.examples.FemaleInfoCollection.

object FemaleInfoCollection
 {
  // Table structure, used for mapping each line of the text data to a DataFrame row.
  // stayTime is assumed to be the online-shopping duration in minutes — TODO confirm against the log format.
  case class FemaleInfo(name: String, gender: String, stayTime: Int)

  /**
   * Entry point. Expects args(0) to be the path of the input log file,
   * where each line is "name,gender,stayTime".
   */
  def main(args: Array[String]): Unit = {
    // Configure the Spark application name.
    val spark = SparkSession
      .builder()
      .appName("FemaleInfo")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()
    import spark.implicits._
    // Convert the RDD to a DataFrame through implicit conversion, then register a temp view.
    // createOrReplaceTempView replaces registerTempTable, which is deprecated since Spark 2.0.
    spark.sparkContext.textFile(args(0)).map(_.split(","))
      .map(p => FemaleInfo(p(0), p(1), p(2).trim.toInt))
      .toDF.createOrReplaceTempView("FemaleInfoTable")
    // Use SQL to sum the online time of each female netizen, aggregating rows with the same name.
    val femaleTimeInfo = spark.sql(
      "select name, sum(stayTime) as stayTime from FemaleInfoTable " +
        "where gender = 'female' group by name")
    // Keep only female netizens who spent at least two hours (120 minutes) online, then print them.
    femaleTimeInfo.filter("stayTime >= 120").collect().foreach(println)
    spark.stop()
  }
}

For details about other Spark SQL features, visit http://archive.apache.org/dist/spark/docs/3.3.1/sql-programming-guide.html#running-sql-queries-programmatically.