在实际项目的开发过程中,不同Kafka主题的数据规模、数据频率各不相同,需要配置不同的Flume参数,而这一切的调试、配置工作,都要建立在理解Flume配置文件各参数含义的基础上。
# HDFS sink (k1) settings that are defined only in this part of the file.
# The bucket-rounding keys (hdfs.round, hdfs.roundValue, hdfs.roundUnit) and the
# file-rolling keys (hdfs.rollInterval, hdfs.rollSize, hdfs.rollCount) used to be
# repeated here as well. They are assigned again in the HDFS sink definition
# further down, and when a key appears twice in a properties file the later
# assignment wins, so the copies here (rollInterval = 60, rollSize = 1024000)
# never took effect. They were removed so that each key is set in one place.
# Use the agent's local time, instead of a timestamp event header, when
# expanding time escapes such as %Y-%m-%d in hdfs.path.
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# Number of events accumulated before they are flushed to HDFS.
a1.sinks.k1.hdfs.batchSize = 100
# Output file type; DataStream writes the events uncompressed (a compressed
# file type is also supported).
a1.sinks.k1.hdfs.fileType = DataStream
### Declare the components of agent a1: its source, channel and sink aliases
# Source alias
a1.sources = s1
# Channel alias
a1.channels = c1
# Sink alias
a1.sinks = k1
### Kafka source definition
a1.sources.s1.type = org.apache.flume.source.kafka.KafkaSource
# Maximum number of messages written to the channel in one batch
a1.sources.s1.batchSize = 5000
# Maximum time (in ms) before a batch is written to the channel. The batch is
# written as soon as either the size limit or the time limit is reached.
a1.sources.s1.batchDurationMillis = 2000
# Kafka broker address (host:port)
a1.sources.s1.kafka.bootstrap.servers = 192.168.0.27:9092
# Kafka consumer group id and offset-reset policy.
# The official example for Flume 1.9.0 only sets the topic, but in testing that
# did not consume messages correctly; a consumer group id (any name of your own)
# and an offset-reset policy had to be added.
a1.sources.s1.kafka.consumer.group.id = evaluation_group
a1.sources.s1.kafka.consumer.auto.offset.reset = earliest
# Kafka topic to consume
a1.sources.s1.kafka.topics = topic_b_evaluation
### HDFS sink definition
a1.sinks.k1.type = hdfs
# Target directory. %Y-%m-%d is a time escape, so events land in one
# day=YYYY-MM-DD directory per day under the ods_queue table path.
a1.sinks.k1.hdfs.path = hdfs://hurys23:8020/user/hive/warehouse/hurys_dc_ods.db/ods_queue/day=%Y-%m-%d/
# Prefix and suffix of the files created in that directory
a1.sinks.k1.hdfs.filePrefix = queue
a1.sinks.k1.hdfs.fileSuffix = .log
# Round the timestamp used for the path escapes down to a multiple of
# roundValue x roundUnit (here: 10 seconds).
# NOTE(review): hdfs.path only has day granularity, so 10-second rounding
# should not change the resulting directory - confirm these keys are needed.
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 10
a1.sinks.k1.hdfs.roundUnit = second
# Roll to a new file once the current one reaches this many bytes (about 10 MB)
a1.sinks.k1.hdfs.rollSize = 10240000
# 0 = file rolling does not depend on the number of events
a1.sinks.k1.hdfs.rollCount = 0
# 0 = file rolling does not depend on elapsed time (per the Flume HDFS sink docs)
a1.sinks.k1.hdfs.rollInterval = 0
# Close a file after it has been idle (no writes) for this many seconds
a1.sinks.k1.hdfs.idleTimeout = 60
# NOTE(review): presumably set to 1 so that HDFS block replication activity
# does not trigger extra file rolls - confirm against the Flume documentation.
a1.sinks.k1.hdfs.minBlockReplicas = 1
### Channel that carries events from the Kafka source to the HDFS sink
# Memory channel: fast, but data is lost if the machine loses power
# File channel: slower, but data is not lost even if the machine loses power
a1.channels.c1.type = file
# checkpointDir and dataDirs are deliberately not set here; according to the
# official documentation, default locations are used when they are omitted.
# Maximum number of events the channel can store
a1.channels.c1.capacity = 100000
# Maximum number of events per transaction
a1.channels.c1.transactionCapacity = 10000
### Bind the source and the sink to the channel
# Source s1 writes its events into channel c1
a1.sources.s1.channels = c1
# Sink k1 reads its events from channel c1
a1.sinks.k1.channel = c1