Python Example Code
Function
Collect information about female netizens who spent more than 2 hours shopping online over the weekend.
Example Code
The following code segment is only an example. For details, see SparkSQLPythonExample.
# -*- coding:utf-8 -*-
import sys
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
def contains(str1, substr1):
    """Return True if *substr1* occurs in *str1*, otherwise False.

    The check is delegated to the ``in`` operator, so *str1* may be any
    object that supports membership tests (typically a string).
    """
    return substr1 in str1
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # A single parenthesized string prints the same on Python 2 and 3.
        print("Usage: SparkSQLPythonExample.py <file>")
        sys.exit(-1)

    # SparkSession is the entry point for both reading data and running SQL,
    # so no separate SQLContext needs to be built on top of it.
    spark = SparkSession.builder.appName("CollectFemaleInfo").getOrCreate()
    try:
        # Convert the text input to an RDD of (name, gender, stayTime) tuples.
        # Each line is split on "," and the third field is parsed as an int.
        inputPath = sys.argv[1]
        inputRDD = spark.read.text(inputPath).rdd.map(lambda r: r[0])\
            .map(lambda line: line.split(","))\
            .map(lambda dataArr: (dataArr[0], dataArr[1], int(dataArr[2])))

        # Build the DataFrame straight from the RDD; collecting to the driver
        # first would load the whole input into driver memory.
        # Columns get the default names _1, _2 and _3.
        df = spark.createDataFrame(inputRDD)

        # Register a temporary view so the data can be queried with SQL.
        df.createOrReplaceTempView("FemaleInfoTable")

        # Sum the stay time per female user and display those whose total
        # exceeds 120 (presumably minutes, i.e. 2 hours -- confirm the unit
        # against the input data).
        spark.sql("SELECT * FROM " +
                  "(SELECT _1 AS Name,SUM(_3) AS totalStayTime FROM FemaleInfoTable " +
                  "WHERE _2 = 'female' GROUP BY _1 )" +
                  " WHERE totalStayTime >120").show()
    finally:
        # Stop the session even if reading or querying fails.
        spark.stop()
Feedback
Was this page helpful?
Provide feedback
Thank you very much for your feedback. We will continue working to improve the documentation.