Updated on 2024-08-10 GMT+08:00

Spark SQL Sample Projects (Python)

Function

Collect statistics on female netizens who spend more than two hours shopping online during weekends.

Sample Code

The following code snippet is used as an example. For the complete code, see the SparkSQLPythonExample class.

# -*- coding:utf-8 -*-

import sys
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext


def contains(str1, substr1):
    """Return True when ``substr1`` occurs in ``str1``, otherwise False."""
    # The ``in`` operator already yields a bool, so no branching is needed.
    return substr1 in str1


if __name__ == "__main__":
    # The script needs one positional argument: the path of the input text file.
    if len(sys.argv) < 2:
        # A parenthesised single-argument print is valid on Python 2 and 3.
        print("Usage: SparkSQLPythonExample.py <file>")
        # sys.exit is always available; the bare exit() helper is injected by
        # the site module and is not guaranteed to exist.
        sys.exit(-1)

    # Initialize the SparkSession. It is the entry point for reading data,
    # creating DataFrames and running SQL, so no separate SQLContext is needed.
    spark = SparkSession.builder.appName("CollectFemaleInfo").getOrCreate()

    try:
        # Convert RDD to DataFrame.
        # Each input line is split on "," into a 3-tuple whose third field is
        # converted to int. The RDD is handed to createDataFrame directly
        # (no collect()), so the data stays distributed instead of being
        # pulled into the driver's memory.
        inputPath = sys.argv[1]
        recordRDD = spark.read.text(inputPath).rdd.map(lambda r: r[0])\
            .map(lambda line: line.split(","))\
            .map(lambda dataArr: (dataArr[0], dataArr[1], int(dataArr[2])))
        # No schema is given, so the tuple fields get the default column
        # names _1, _2 and _3 that the query below refers to.
        df = spark.createDataFrame(recordRDD)

        # Register a temporary view (replaces the deprecated registerTempTable).
        df.createOrReplaceTempView("FemaleInfoTable")

        # Run SQL query statements and display the query result:
        # sum _3 per _1 for rows whose _2 is 'female', and keep the groups
        # whose total exceeds 120. show() prints the rows and returns None,
        # so its result is not assigned.
        spark.sql("SELECT * FROM " +
                  "(SELECT _1 AS Name,SUM(_3) AS totalStayTime FROM FemaleInfoTable " +
                  "WHERE _2 = 'female' GROUP BY _1 )" +
                  " WHERE totalStayTime >120").show()
    finally:
        # Stop the session even when the job fails.
        spark.stop()