Updated on 2024-08-10 GMT+08:00

Spark SQL Sample Projects (Java)

Function

Collect statistics on female netizens who dwell on online shopping for more than two hours during weekends.

Sample Code

The following code snippets are used as an example. For complete codes, see the com.huawei.bigdata.spark.examples.FemaleInfoCollection class.

public static void main(String[] args) throws Exception {
SparkSession spark = SparkSession
      .builder()
      .appName("CollectFemaleInfo")
      .config("spark.some.config.option", "some-value")
      .getOrCreate();

        // Convert RDD to DataFrame through the implicit conversion.
        JavaRDD<FemaleInfo> femaleInfoJavaRDD = spark.read().textFile(args[0]). javaRDD().map(
                new Function<String, FemaleInfo>() {
                    @Override
                    public FemaleInfo call(String line) throws Exception {
                        String[] parts = line.split(",");
                        FemaleInfo femaleInfo = new FemaleInfo();
                        femaleInfo.setName(parts[0]);
                        femaleInfo.setGender(parts[1]);
                        femaleInfo.setStayTime(Integer.parseInt(parts[2].trim()));
                        return femaleInfo;
                    }
                });

        // Register a table.
        Dataset<ROW> schemaFemaleInfo = spark.createDataFrame(femaleInfoJavaRDD,FemaleInfo.class);
        schemaFemaleInfo.registerTempTable("FemaleInfoTable");

        // Run the SQL query.
        Dataset<ROW> femaleTimeInfo = spark.sql("select * from " +
                "(select name,sum(stayTime) as totalStayTime from FemaleInfoTable " +
                "where gender = 'female' group by name )" +
                " tmp where totalStayTime >120");

       // Display the result.
        List<String> result = femaleTimeInfo.javaRDD().map(new Function<Row, String>() {
            public String call(Row row) {
                return  row.getString(0) + "," + row.getLong(1);
            }
        }).collect();
        System.out.println(result);
        spark.stop();
    }

For details about other Spark SQL features, visit http://spark.apache.org/docs/3.1.1/sql-programming-guide.html#running-sql-queries-programmatically.