Java Sample Code
Function Description
Collect statistics in real time on female netizens whose continuous online shopping time exceeds half an hour, and either print the results directly or write them to Kafka.
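The examples below assume that each message arriving from Kafka is a comma-separated record of the form <name>,<gender>,<stayTimeInSeconds>, which is the format implied by the parsing logic in the samples. A few illustrative records (the names and values are hypothetical):

    LiuYang,female,2060
    GuoYijun,male,1200
    CaiXuyu,female,1850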
Spark Streaming Write To Print Sample Code
The following code snippet is for demonstration only. For the complete code, see com.huawei.bigdata.spark.examples.FemaleInfoCollectionPrint:
// Parameter description:
// <batchTime>: interval (in seconds) at which Streaming processes data in batches.
// <windowTime>: time span (in seconds) over which statistics are collected.
// <topics>: topics subscribed in Kafka; separate multiple topics with commas.
// <brokers>: Kafka address used to obtain metadata.
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;

import kafka.serializer.StringDecoder;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;

import scala.Tuple2;
import scala.Tuple3;

public class FemaleInfoCollectionPrint {
    public static void main(String[] args) throws Exception {
        String batchTime = args[0];
        final String windowTime = args[1];
        String topics = args[2];
        String brokers = args[3];

        Duration batchDuration = Durations.seconds(Integer.parseInt(batchTime));
        Duration windowDuration = Durations.seconds(Integer.parseInt(windowTime));

        SparkConf conf = new SparkConf().setAppName("DataSightStreamingExample");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, batchDuration);

        // Set the checkpoint directory for Streaming. Because windows are used,
        // this parameter is mandatory.
        jssc.checkpoint("checkpoint");

        // Assemble the list of Kafka topics.
        HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(topics.split(",")));
        HashMap<String, String> kafkaParams = new HashMap<String, String>();
        kafkaParams.put("metadata.broker.list", brokers);

        // Create a Kafka stream directly from the brokers and topics.
        // 1. Receive data from Kafka and generate the corresponding DStream.
        JavaDStream<String> lines = KafkaUtils.createDirectStream(jssc, String.class, String.class,
                StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet).map(
                new Function<Tuple2<String, String>, String>() {
                    @Override
                    public String call(Tuple2<String, String> tuple2) {
                        return tuple2._2();
                    }
                }
        );

        // 2. Obtain the field attributes of each line.
        JavaDStream<Tuple3<String, String, Integer>> records = lines.map(
                new Function<String, Tuple3<String, String, Integer>>() {
                    @Override
                    public Tuple3<String, String, Integer> call(String line) throws Exception {
                        String[] elems = line.split(",");
                        return new Tuple3<String, String, Integer>(elems[0], elems[1], Integer.parseInt(elems[2]));
                    }
                }
        );

        // 3. Filter the online time records of female netizens.
        JavaDStream<Tuple2<String, Integer>> femaleRecords = records.filter(
                new Function<Tuple3<String, String, Integer>, Boolean>() {
                    public Boolean call(Tuple3<String, String, Integer> line) throws Exception {
                        return line._2().equals("female");
                    }
                }).map(new Function<Tuple3<String, String, Integer>, Tuple2<String, Integer>>() {
                    public Tuple2<String, Integer> call(Tuple3<String, String, Integer> stringStringIntegerTuple3)
                            throws Exception {
                        return new Tuple2<String, Integer>(stringStringIntegerTuple3._1(),
                                stringStringIntegerTuple3._3());
                    }
                });

        // 4. Sum the online time of each female netizen within a time window.
        // The second function is the inverse of the first, so Spark can update
        // the window state incrementally (this is why checkpointing is required).
        JavaPairDStream<String, Integer> aggregateRecords = JavaPairDStream.fromJavaDStream(femaleRecords)
                .reduceByKeyAndWindow(new Function2<Integer, Integer, Integer>() {
                    public Integer call(Integer integer, Integer integer2) throws Exception {
                        return integer + integer2;
                    }
                }, new Function2<Integer, Integer, Integer>() {
                    public Integer call(Integer integer, Integer integer2) throws Exception {
                        return integer - integer2;
                    }
                }, windowDuration, batchDuration);

        // 5. Filter the users whose continuous online time exceeds the threshold
        // (more than 90% of the window span), and print the results.
        JavaPairDStream<String, Integer> upTimeUser = aggregateRecords.filter(
                new Function<Tuple2<String, Integer>, Boolean>() {
                    public Boolean call(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
                        return stringIntegerTuple2._2() > 0.9 * Integer.parseInt(windowTime);
                    }
                });
        upTimeUser.print();

        // 6. Start the Streaming system.
        jssc.start();
        jssc.awaitTermination();
    }
}
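Once packaged, the sample can be submitted to the cluster with spark-submit. The following launch command is a hypothetical illustration (the JAR name and all parameter values are placeholders, not mandated by the sample):

    bin/spark-submit --master yarn-client --class com.huawei.bigdata.spark.examples.FemaleInfoCollectionPrint SparkStreamingExample.jar 5 1800 topic1 kafka01:21005

Here 5 is the batch interval in seconds, 1800 is the window span in seconds (half an hour), topic1 is the subscribed Kafka topic, and kafka01:21005 is the broker address used to fetch metadata.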
Spark Streaming Write To Kafka Sample Code
The following code snippet is for demonstration only. For the complete code, see com.huawei.bigdata.spark.examples.JavaDstreamKafkaWriter:
- After the Spark version upgrade, the new createDirectStream interface is recommended. The old createStream interface still exists, but its performance and stability are poor; you are advised not to use it when developing applications (a comparison sketch follows these notes).
- This sample code exists only in mrs-sample-project-1.6.0.zip.
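As mentioned in the first note above, here is a minimal sketch contrasting the two interfaces, based on the public Spark 1.x streaming-kafka (Kafka 0.8) API; jssc, kafkaParams, and topicsSet are assumed to be defined as in the samples, and the ZooKeeper address, group ID, and topic name are placeholders:

    // Old receiver-based interface (not recommended): data passes through a
    // receiver, and consumed offsets are tracked in ZooKeeper.
    Map<String, Integer> topicThreads = new HashMap<String, Integer>();
    topicThreads.put("topic1", 1); // topic -> number of receiver threads
    JavaPairReceiverInputDStream<String, String> oldStream =
            KafkaUtils.createStream(jssc, "zkHost:2181", "exampleGroup", topicThreads);

    // New direct interface (recommended): offsets are read directly from the
    // brokers, which avoids the receiver and improves performance and stability.
    JavaPairInputDStream<String, String> directStream =
            KafkaUtils.createDirectStream(jssc, String.class, String.class,
                    StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);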
// Parameter description:
// <groupId>: the consumer's group.id.
// <brokers>: broker IP addresses and ports.
// <topic>: the Kafka topic.
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;
import java.util.Queue;

import kafka.producer.KeyedMessage;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

// Note: JavaDStreamKafkaWriterFactory is provided by the MRS sample project
// (mrs-sample-project-1.6.0.zip); import it from there.

public class JavaDstreamKafkaWriter {
    public static void main(String[] args) throws InterruptedException {
        if (args.length != 3) {
            System.err.println("Usage: JavaDstreamKafkaWriter <groupId> <brokers> <topic>");
            System.exit(1);
        }

        final String groupId = args[0];
        final String brokers = args[1];
        final String topic = args[2];

        SparkConf sparkConf = new SparkConf().setAppName("KafkaWriter");

        // Configure Kafka.
        Properties kafkaParams = new Properties();
        kafkaParams.put("metadata.broker.list", brokers);
        kafkaParams.put("group.id", groupId);
        kafkaParams.put("auto.offset.reset", "smallest");

        // Create a Java streaming context.
        JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.milliseconds(500));

        // Data to be sent to Kafka.
        List<String> sentData = new ArrayList<String>();
        sentData.add("kafka_writer_test_msg_01");
        sentData.add("kafka_writer_test_msg_02");
        sentData.add("kafka_writer_test_msg_03");

        // Create an RDD queue.
        Queue<JavaRDD<String>> sent = new LinkedList<JavaRDD<String>>();
        sent.add(ssc.sparkContext().parallelize(sentData));

        // Create a DStream from the queued data.
        JavaDStream<String> wStream = ssc.queueStream(sent);

        // Write the stream to Kafka.
        JavaDStreamKafkaWriterFactory.fromJavaDStream(wStream).writeToKafka(kafkaParams,
                new Function<String, KeyedMessage<String, byte[]>>() {
                    public KeyedMessage<String, byte[]> call(String s) {
                        return new KeyedMessage<String, byte[]>(topic, s.getBytes());
                    }
                });

        ssc.start();
        ssc.awaitTermination();
    }
}
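After the application runs, the messages written by the sample could be verified with the console consumer shipped with Kafka. A hypothetical check (the ZooKeeper address is a placeholder):

    bin/kafka-console-consumer.sh --zookeeper zkHost:2181 --topic <topic> --from-beginning

Each of the three kafka_writer_test_msg_* strings should appear in the output.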
Parent topic: Spark Streaming Programs