Python Example Code
Function
Collect statistics on female netizens who spent more than 2 hours shopping online over the weekend.
Example Code
The following code segment is only an example. For details, see SparkSQLPythonExample.
# -*- coding:utf-8 -*-
"""Spark SQL example: find female netizens with a long total online time.

Reads a text file of comma-separated ``name,gender,stayTime`` records,
sums the stay time per name for rows whose gender is ``female`` and shows
the names whose total exceeds 120.

Usage: SparkSQLPythonExample.py <file>
"""

import sys


def contains(str1, substr1):
    """Return True if ``substr1`` occurs in ``str1``, otherwise False."""
    return substr1 in str1


def _parse_line(line):
    """Split one ``name,gender,stayTime`` record into a (str, str, int) tuple."""
    fields = line.split(",")
    return (fields[0], fields[1], int(fields[2]))


def main(argv=None):
    """Run the query against the file named by the first argument.

    Args:
        argv: Argument list in ``sys.argv`` layout (program name first).
            Defaults to ``sys.argv``.

    Exits with status -1 after printing a usage line when no input path
    is supplied.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        # Single-argument print() behaves the same on Python 2 and 3.
        print("Usage: SparkSQLPythonExample.py <file>")
        sys.exit(-1)

    # Imported here so the helpers above stay importable without Spark.
    from pyspark.sql import SparkSession

    # SparkSession is the single entry point; a separate SQLContext is
    # not needed to create DataFrames or run SQL.
    spark = SparkSession.builder.appName("CollectFemaleInfo").getOrCreate()
    try:
        # Keep the data distributed: hand the RDD itself to
        # createDataFrame instead of collect()-ing every row to the driver.
        records = (
            spark.read.text(argv[1]).rdd
            .map(lambda row: row[0])
            # Blank lines would otherwise fail in _parse_line.
            .filter(lambda line: line.strip())
            .map(_parse_line)
        )
        df = spark.createDataFrame(records, ["name", "gender", "stayTime"])

        # Register the DataFrame as a temporary view for SQL access.
        df.createOrReplaceTempView("FemaleInfoTable")

        # Run the SQL query and display the result. show() returns None,
        # so its result is deliberately not bound to a name.
        spark.sql(
            "SELECT * FROM "
            "(SELECT name AS Name, SUM(stayTime) AS totalStayTime "
            "FROM FemaleInfoTable "
            "WHERE gender = 'female' GROUP BY name) "
            "WHERE totalStayTime > 120"
        ).show()
    finally:
        # Release cluster resources even if the job fails.
        spark.stop()


if __name__ == "__main__":
    main()
Feedback
Was this page helpful?
Provide feedback. Thank you very much for your feedback. We will continue working to improve the documentation.