赞
踩
在官方文档的描述中,API FlinkKafkaConsumer和FlinkKafkaProducer将在后续版本陆续弃用、移除,所以在未来生产中有版本升级的情况下,新API KafkaSource和KafkaSink还是有必要学会使用的。下面介绍下基于新API的一些自定义类以及主程序的简单实践。
官方文档地址:
https://nightlies.apache.org/flink/flink-docs-release-1.15/zh/docs/connectors/datastream/kafka/
自定义反序列化器可以以指定的格式取到来源Kafka消息中我们想要的元素。该类需要继承 KafkaDeserializationSchema ,这里简单将来源Kafka的topic、key、value以Tuple3[String, String, String]的格式取出来。
MyKafkaDeserializationSchemaTuple3.scala
- import org.apache.flink.api.common.typeinfo.TypeInformation
- import org.apache.flink.streaming.connectors.kafka.KafkaDeserializationSchema
- import org.apache.kafka.clients.consumer.ConsumerRecord
-
- import java.nio.charset.StandardCharsets
-
- /**
- * @author hushhhh
- */
- class MyKafkaDeserializationSchemaTuple3 extends KafkaDeserializationSchema[(String, String, String)] {
- override def deserialize(record: ConsumerRecord[Array[Byte], Array[Byte]]): (String, String, String) = {
- new Tuple3[String, String, String](
- record.topic(),
- new String(record.key(), StandardCharsets.UTF_8),
- new String(record.value(), StandardCharsets.UTF_8))
- }
-
- override def isEndOfStream(nextElement: (String, String, String)): Boolean = false
-
- override def getProducedType: TypeInformation[(String, String, String)] = {
- TypeInformation.of(classOf[(String, String, String)])
- }
- }

自定义一个 TopicSelector 可以将流中多个topic里的数据根据一定逻辑分发到不同的目标topic里。该类需要继承 TopicSelector ,这里简单根据来源Kafka的topic名拼接下。
MyTopicSelector.scala
- import org.apache.flink.connector.kafka.sink.TopicSelector
-
- /**
- * @author hushhhh
- */
- class MyTopicSelector extends TopicSelector[(String, String, String)] {
- override def apply(t: (String, String, String)): String = {
- // t: 来源kafka的topic、key、value
- "TOPIC_" + t._1.toUpperCase()
- }
- }
自定义序列化器可以将数据根据自己的业务格式写到目标Kafka的key和value里,这里将来源Kafka里的key和value直接写出去,这两个类都需要继承 SerializationSchema 。
MyKeySerializationSchema.scala
- import org.apache.flink.api.common.serialization.SerializationSchema
-
- /**
- * @author hushhhh
- */
- class MyKeySerializationSchema extends SerializationSchema[(String, String, String)] {
- override def serialize(element: (String, String, String)): Array[Byte] = {
- // element: 来源kafka的topic、key、value
- element._2.getBytes()
- }
- }
MyValueSerializationSchema.scala
- import org.apache.flink.api.common.serialization.SerializationSchema
-
- /**
- * @author hushhhh
- */
- class MyValueSerializationSchema extends SerializationSchema[(String, String, String)] {
- override def serialize(element: (String, String, String)): Array[Byte] = {
- // element: 来源kafka的topic、key、value
- element._3.getBytes()
- }
- }
自定义分区器可以根据具体逻辑对要写到目标Kafka 里的数据进行partition分配。该类需要继承 FlinkKafkaPartitioner ,这里根据key的hash分配到不同的partition里(如果目标topic有多个partition的话)。
MyPartitioner.scala
- import org.apache.flink.streaming.connectors.kafka.partitioner.FlinkKafkaPartitioner
-
- /**
- * @author hushhhh
- */
- class MyPartitioner extends FlinkKafkaPartitioner[(String, String, String)] {
- override def partition(record: (String, String, String), key: Array[Byte], value: Array[Byte], targetTopic: String, partitions: Array[Int]): Int = {
- // record: 来源kafka的topic、key、value
- Math.abs(new String(record._2).hashCode % partitions.length)
- }
- }
Main.scala
- import format.{MyKafkaDeserializationSchemaTuple3, MyKeySerializationSchema, MyPartitioner, MyTopicSelector, MyValueSerializationSchema}
- import org.apache.flink.api.common.eventtime.WatermarkStrategy
- import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
- import org.apache.flink.api.scala._
- import org.apache.flink.connector.base.DeliveryGuarantee
- import org.apache.flink.connector.kafka.sink.{KafkaRecordSerializationSchema, KafkaSink}
- import org.apache.flink.connector.kafka.source.KafkaSource
- import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer
- import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema
- import org.apache.kafka.clients.consumer.OffsetResetStrategy
-
- import java.util.Properties
- import scala.collection.JavaConverters._
-
- /**
- * @author hushhhh
- */
- object Main {
- def main(args: Array[String]): Unit = {
- /**
- * env
- */
- // stream环境
- val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
-
- /**
- * source
- */
- // 定义 KafkaSource
- lazy val kafkaSource: KafkaSource[(String, String, String)] = KafkaSource.builder()
- // Kafka消费者的各种配置文件,此处省略配置
- .setProperties(new Properties())
- // 配置消费的一个或多个topic
- .setTopics("sourceTopic1,sourceTopic2,...".split(",", -1).toList.asJava)
- // 开始消费位置,从已提交的offset开始消费,没有的话从最新的消息开始消费
- .setStartingOffsets(OffsetsInitializer.committedOffsets(OffsetResetStrategy.LATEST))
- // 反序列化,使用之前我们自定义的反序列化器
- .setDeserializer(KafkaRecordDeserializationSchema.of(new MyKafkaDeserializationSchemaTuple3))
- .build()
- // 添加 kafka source
- val inputDS: DataStream[(String, String, String)] = env.fromSource(
- kafkaSource,
- WatermarkStrategy.noWatermarks(),
- "MyKafkaSource")
- .setParallelism(1)
-
- /**
- * transformation
- */
- // 数据加工处理,此处省略
-
- /**
- * sink
- */
- // 定义 KafkaSink
- lazy val kafkaSink: KafkaSink[(String, String, String)] =
- KafkaSink.builder[(String, String, String)]()
- // 目标集群地址
- .setBootstrapServers("bootstrap.servers")
- // Kafka生产者的各种配置文件,此处省略配置
- .setKafkaProducerConfig(new Properties())
- // 定义消息的序列化模式
- .setRecordSerializer(KafkaRecordSerializationSchema.builder()
- // Topic选择器,使用之前我们自定义的Topic选择器
- .setTopicSelector(new MyTopicSelector)
- // Key的序列化器,使用之前我们自定义的Key序列化器
- .setKeySerializationSchema(new MyKeySerializationSchema)
- // Value的序列化器,使用之前我们自定义的Value序列化器
- .setValueSerializationSchema(new MyValueSerializationSchema)
- // 自定义分区器,使用之前我们自定义的自定义分区器
- .setPartitioner(new MyPartitioner)
- .build()
- )
- // 语义保证,保证至少一次
- .setDeliverGuarantee(DeliveryGuarantee.AT_LEAST_ONCE)
- .build()
-
- // 添加 kafka sink
- inputDS.sinkTo(kafkaSink)
- .name("MyKafkaSink")
- .setParallelism(1)
-
- /**
- * execute
- */
- env.execute("myJob")
- }
-
- }

以上就是KafkaSource和KafkaSink API的简单使用。大佬们感觉有用的话点个赞吧~
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。