Updated on 2024-10-23 GMT+08:00

Spark SQL Sample Projects (Python)

Function

Collect information about female netizens who spent more than 2 hours shopping online over the weekend.

Sample Code

The following code segment is only an example. For details, see SparkSQLPythonExample.

# -*- coding:utf-8 -*-

import sys
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext


def contains(str1, substr1):
    """Return True when ``substr1`` occurs in ``str1``, otherwise False."""
    # The ``in`` operator always yields a bool, so it can be returned directly.
    return substr1 in str1


if __name__ == "__main__":
    # The script needs exactly one argument: the path of the input text file.
    if len(sys.argv) < 2:
        # Function-call form of print is valid on both Python 2 and Python 3.
        print("Usage: SparkSQLPythonExample.py <file>")
        sys.exit(-1)

    # SparkSession is the entry point for both DataFrame and SQL operations,
    # so a separate SQLContext wrapper is not required.
    spark = SparkSession.builder.appName("CollectFemaleInfo").getOrCreate()

    try:
        # Read the input as lines of comma-separated text and turn each line
        # into a (name, gender, stay time) tuple; the third field must be an
        # integer.  The RDD is passed to createDataFrame as-is (no collect()),
        # so the data stays distributed instead of being pulled to the driver.
        inputPath = sys.argv[1]
        inputRDD = spark.read.text(inputPath).rdd.map(lambda r: r[0])\
            .map(lambda line: line.split(","))\
            .map(lambda dataArr: (dataArr[0], dataArr[1], int(dataArr[2])))
        # No schema is given, so the columns get the default names _1, _2, _3.
        df = spark.createDataFrame(inputRDD)

        # Register the DataFrame as a temporary view so it can be queried by
        # name in SQL (replaces the deprecated registerTempTable).
        df.createOrReplaceTempView("FemaleInfoTable")

        # Sum the stay time per female user and print those whose total is
        # above 120 (presumably minutes, i.e. 2 hours).  show() prints the
        # rows and returns None, so its result is not kept.
        spark.sql("SELECT * FROM " +
                  "(SELECT _1 AS Name,SUM(_3) AS totalStayTime FROM FemaleInfoTable " +
                  "WHERE _2 = 'female' GROUP BY _1 )" +
                  " WHERE totalStayTime >120").show()
    finally:
        # Always release the Spark resources, even when the job fails.
        spark.stop()