[Flink] Flink Data Processing Flows and Scala Implementations

1. Flink startup and execution flow

2. Execution flow of the assignAscendingTimestamps operator

3. Execution flow of the assignTimestampsAndWatermarks operator

4. Execution flow of the aggregate pre-aggregation method

5. TopN hot items by click count

## HotItems.scala

```scala
import java.sql.Timestamp
import java.util.Properties

import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.common.state.{ListState, ListStateDescriptor}
import org.apache.flink.api.java.tuple.{Tuple, Tuple1}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.util.Collector

import scala.collection.mutable.ListBuffer

// Input data
case class UserBehavior( userId: Long, itemId: Long, categoryId: Int, behavior: String, timestamp: Long )
// Output data
case class ItemViewCount( itemId: Long, windowEnd: Long, count: Long )

object HotItems {
  def main(args: Array[String]): Unit = {

    // val properties = new Properties()

    // Flink streaming execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    // Use event time as the stream time characteristic
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    // Set parallelism to 1
    env.setParallelism(1)

    // Read the source data file
    val stream = env
      .readTextFile("D:\\MyIdeaProjects\\UserBehaviorAnalysis\\HotItemsAnalysis\\src\\main\\resources\\UserBehavior.csv")

      // Parse each line of the file into a UserBehavior record
      .map(line => {
        val linearray = line.split(",")
        UserBehavior( linearray(0).toLong, linearray(1).toLong, linearray(2).toInt, linearray(3), linearray(4).toLong )
      })

      // Assign ascending event-time timestamps (seconds -> milliseconds)
      .assignAscendingTimestamps(_.timestamp * 1000)
      // Keep only "pv" (page view) events
      .filter(_.behavior == "pv")
      // Partition the stream by item id
      .keyBy("itemId")
      // Sliding window: 1 hour long, sliding every 5 minutes
      .timeWindow(Time.hours(1), Time.minutes(5))
      // Pre-aggregation: the first argument counts the clicks, the second turns each item's count per window into an ItemViewCount
      .aggregate( new CountAgg(), new WindowResultFunction() )
      // Bring together the counts of different items that belong to the same window
      .keyBy("windowEnd")
      // Apply the TopN business logic in a custom process function
      .process( new TopNHotItems(3))
      .print()


    env.execute("Hot Items Job")
  }
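  // How aggregate(new CountAgg(), new WindowResultFunction()) works: CountAgg is applied
  // incrementally to each arriving element, so the window state only holds a single Long;
  // when the window fires, WindowResultFunction receives that pre-aggregated count together
  // with the key and window metadata and wraps them into an ItemViewCount.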

  // Incrementally count the number of clicks
  class CountAgg extends AggregateFunction[UserBehavior, Long, Long]{
    override def add(value: UserBehavior, accumulator: Long): Long = accumulator + 1

    override def createAccumulator(): Long = 0L

    override def getResult(accumulator: Long): Long = accumulator

    override def merge(a: Long, b: Long): Long = a + b
  }

  // Emit the total count of one item within one window
  class WindowResultFunction extends WindowFunction[Long, ItemViewCount, Tuple, TimeWindow]{
    override def apply(key: Tuple, window: TimeWindow, input: Iterable[Long], out: Collector[ItemViewCount]): Unit = {
      val itemId: Long = key.asInstanceOf[Tuple1[Long]].f0
      val count = input.iterator.next()
      out.collect(ItemViewCount(itemId, window.getEnd, count))
    }
  }

  // Compute the top N most-clicked items within a window
  class TopNHotItems(topSize: Int) extends KeyedProcessFunction[Tuple, ItemViewCount, String]{

    // ListState used to buffer the per-item counts of the current window
    private var itemState: ListState[ItemViewCount] = _

    override def open(parameters: Configuration): Unit = {
      super.open(parameters)

      val itemStateDesc = new ListStateDescriptor[ItemViewCount]("itemState", classOf[ItemViewCount])
      // Create the ListState handle
      itemState = getRuntimeContext.getListState(itemStateDesc)
    }

    // Buffer each element and register a timer
    override def processElement(i: ItemViewCount, context: KeyedProcessFunction[Tuple, ItemViewCount, String]#Context, collector: Collector[String]): Unit = {
      // Buffer every ItemViewCount that belongs to this window
      itemState.add(i)

      // Register a timer at windowEnd + 1 ms as the trigger for closing the window
      context.timerService.registerEventTimeTimer( i.windowEnd + 1 )
    }

    // onTimer is invoked once the watermark reaches windowEnd + 1
    override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Tuple, ItemViewCount, String]#OnTimerContext, out: Collector[String]): Unit = {

      val allItems: ListBuffer[ItemViewCount] = ListBuffer()
      import  scala.collection.JavaConversions._

      // Copy the data from the ListState into a ListBuffer
      for(item <- itemState.get){
        allItems += item
      }

      // Clear the state
      itemState.clear()

      // Sort by count in descending order and take the top N (3 here)
      val sortedItems = allItems.sortBy(_.count)(Ordering.Long.reverse).take(topSize)

      // Format the result for output
      val result: StringBuilder = new StringBuilder
      result.append("====================================\n")
      result.append("时间:").append(new Timestamp(timestamp - 1)).append("\n")

      for( i <- sortedItems.indices ){
        val currentItem: ItemViewCount = sortedItems(i)

        result.append("No").append(i+1).append(":")
          .append("  商品ID=").append(currentItem.itemId)
          .append("  浏览量=").append(currentItem.count).append("\n")
      }
      result.append("====================================\n\n")

      // Throttle the output to simulate a scrolling real-time display
      Thread.sleep(1000)
      out.collect(result.toString)
    }
  }
}
```
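The job above reads the CSV from the local file system, but the unused imports (`FlinkKafkaConsumer`, `SimpleStringSchema`) and the commented-out `Properties` suggest a Kafka source. Below is a minimal sketch of swapping the file source for Kafka, assuming a local broker and the `hotitems` topic that `KafkaProducer.scala` below writes to; the broker address and group id are illustrative.

```scala
// Replace the readTextFile(...) call above with a Kafka consumer; the rest of the pipeline
// (map, filter, keyBy, timeWindow, aggregate, process) stays exactly the same.
val properties = new Properties()
properties.setProperty("bootstrap.servers", "localhost:9092") // assumed local broker
properties.setProperty("group.id", "hotitems-consumer")       // illustrative consumer group
val stream = env
  .addSource(new FlinkKafkaConsumer[String]("hotitems", new SimpleStringSchema(), properties))
```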
## KafkaProducer.scala

```scala
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

object KafkaProducer {

  def main(args: Array[String]): Unit = {
    writeToKafka("hotitems")
  }

  def writeToKafka(topic: String): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "localhost:9092")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](props)
    val bufferedSource = io.Source.fromFile("D:\\Projects\\BigData\\UserBehaviorAnalysis\\HotItemsAnalysis\\src\\main\\resources\\UserBehavior.csv")
    for (line <- bufferedSource.getLines) {
      val record = new ProducerRecord[String, String](topic, line)
      producer.send(record)
    }
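    // send() is asynchronous; close() blocks until any buffered records have been flushed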
    producer.close()
  }
}
```

6. Login failure detection

## LoginFailWithCep.scala

```scala
package com.gupao.LoginFailDetect

import org.apache.flink.cep.scala.CEP
import org.apache.flink.cep.scala.pattern.Pattern
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time

object LoginFailWithCep {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    val loginStream = env.fromCollection(List(
      LoginEvent(1, "192.168.0.1", "fail", 1558430832),
      LoginEvent(1, "192.168.0.2", "fail", 1558430843),
      LoginEvent(1, "192.168.0.3", "fail", 1558430844),
      LoginEvent(2, "192.168.0.3", "fail", 1558430845),
      LoginEvent(2, "192.168.10.10", "success", 1558430845)
    ))
      .assignAscendingTimestamps(_.eventTime * 1000)
      .keyBy(_.userId)


    // Define the match pattern (rule)
    // begin: the first event, where: its type is "fail"
    // next: strictly followed by a second "fail"
    // times: occurring once
    // within: all of this within 2 seconds
    val loginFailPattern = Pattern.begin[LoginEvent]("begin").where(_.eventType == "fail")
      .next("next").where(_.eventType == "fail")
      .times(1)
      .within(Time.seconds(2))

    // Apply the pattern to the keyed stream to obtain a PatternStream
    val patternStream = CEP.pattern( loginStream, loginFailPattern )

    import scala.collection.Map

    // select extracts the matched "begin" and "next" events
    // and emits the result we are after
    val loginFailDataStream = patternStream.select(
      ( pattern: Map[String, Iterable[LoginEvent]] ) =>{
        val begin = pattern.getOrElse("begin", null).iterator.next()
        val next = pattern.getOrElse("next", null).iterator.next()
        (next.userId, begin.ip, next.ip, next.eventType)
      }
    )
      .print()

    env.execute("Login Fail Detect Job")
  }
}
```
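The `begin`/`next` pair above only describes two consecutive failures (the `.times(1)` on the second step is effectively redundant, since a step matches once by default). As a sketch of an alternative, not taken from the original code, the same kind of rule can be written as a single quantified pattern, e.g. three strictly consecutive failures within 2 seconds:

```scala
// One looping pattern instead of begin + next: three strictly consecutive "fail" events
// for the same user within 2 seconds.
val threeFailsPattern = Pattern.begin[LoginEvent]("fails")
  .where(_.eventType == "fail")
  .times(3).consecutive()
  .within(Time.seconds(2))
```

In the corresponding `select` function the matched events would then all be available under the single key `"fails"` as an Iterable of three LoginEvents.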
## LoginFail.scala

```scala
package com.gupao.LoginFailDetect

import org.apache.flink.api.common.state.{ListState, ListStateDescriptor}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector
import scala.collection.mutable.ListBuffer

case class LoginEvent( userId: Long, ip: String, eventType: String, eventTime: Long )

object LoginFail {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    val loginStream = env.fromCollection(List(
      LoginEvent(1, "192.168.0.1", "fail", 1558430832),
      LoginEvent(1, "192.168.0.2", "fail", 1558430843),
      LoginEvent(1, "192.168.0.3", "fail", 1558430844),
      LoginEvent(2, "192.168.0.3", "fail", 1558430845),
      LoginEvent(2, "192.168.10.10", "success", 1558430845)
    ))
      .assignAscendingTimestamps(_.eventTime * 1000)
      .filter(_.eventType == "fail")
      .keyBy(_.userId)
      .process( new MatchFunction() )
      .print()

    env.execute("Login Fail Detect Job")
  }


  class MatchFunction extends KeyedProcessFunction[Long, LoginEvent, LoginEvent]{

    // ListState buffering the login failures seen so far for the current user
    lazy val loginState: ListState[LoginEvent] = getRuntimeContext.getListState( new ListStateDescriptor[LoginEvent]("loginState", classOf[LoginEvent]) )

    // Buffer every failure and register an event-time timer 2 seconds after its timestamp
    override def processElement(i: LoginEvent, context: KeyedProcessFunction[Long, LoginEvent, LoginEvent]#Context, collector: Collector[LoginEvent]): Unit = {
      loginState.add(i)
      context.timerService().registerEventTimeTimer( i.eventTime * 1000 + 2 * 1000 )
    }


    // When the timer fires, more than one buffered failure within the 2-second span triggers an alert (the first failure is emitted)
    override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Long, LoginEvent, LoginEvent]#OnTimerContext, out: Collector[LoginEvent]): Unit = {
      val allLogins: ListBuffer[LoginEvent] = ListBuffer()
      import scala.collection.JavaConversions._
      for( login <- loginState.get() ){
        allLogins += login
      }
      loginState.clear()

      if(allLogins.length > 1){
        out.collect(allLogins.head)
      }
    }
  }
}
```

7. Network traffic analysis

## NetworkTraffic.scala

```scala
package com.gupao.NetWorkTrafficAnalysis

import java.sql.Timestamp
import java.text.SimpleDateFormat

import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.state.{ListState, ListStateDescriptor}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

import scala.collection.mutable.ListBuffer


// Input data type
case class ApacheLogEvent( ip: String, userId: String, eventTime: Long, method: String, url: String )

// Output data type
case class UrlViewCount( url: String, windowEnd: Long, count: Long )

object NetworkTraffic {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    val stream = env
      .readTextFile("D:\\MyIdeaProjects\\UserBehaviorAnalysis\\NetworkTrafficAnalysis\\src\\main\\resources\\apache.log")
      .map(line =>{
        val linearray = line.split(" ")

        val simpleDateFormat = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss")
        // Convert the log time string to a millisecond timestamp
        val timestamp = simpleDateFormat.parse(linearray(3)).getTime
        ApacheLogEvent( linearray(0), linearray(1), timestamp, linearray(5), linearray(6) )
      })

      // Assign timestamps and watermarks, tolerating events up to 10 seconds out of order
      .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[ApacheLogEvent](Time.seconds(10)) {
        override def extractTimestamp(t: ApacheLogEvent): Long = {
          t.eventTime
        }
      })
      .filter(_.method == "GET")
      .keyBy(_.url)
      .timeWindow(Time.minutes(1), Time.seconds(5))
      .aggregate( new CountAgg(), new WindowResultFunction() )
      .keyBy(_.windowEnd)
      .process( new TopNHotUrls(5) )
      .print()

    env.execute("Network Traffic Analysis Job")
  }

  class CountAgg extends AggregateFunction[ApacheLogEvent, Long, Long]{
    override def add(value: ApacheLogEvent, accumulator: Long): Long = accumulator + 1
    override def createAccumulator(): Long = 0L
    override def getResult(accumulator: Long): Long = accumulator
    override def merge(a: Long, b: Long): Long = a + b
  }

  class WindowResultFunction extends WindowFunction[Long, UrlViewCount, String, TimeWindow]{
    override def apply(key: String, window: TimeWindow, input: Iterable[Long], out: Collector[UrlViewCount]): Unit = {
      val url: String = key
      val count = input.iterator.next()
      out.collect(UrlViewCount(url, window.getEnd, count))
    }
  }


  class TopNHotUrls(topSize: Int) extends KeyedProcessFunction[Long, UrlViewCount, String]{

    lazy val urlState: ListState[UrlViewCount] = getRuntimeContext.getListState( new ListStateDescriptor[UrlViewCount]( "urlState", classOf[UrlViewCount] ) )

    override def processElement(i: UrlViewCount, context: KeyedProcessFunction[Long, UrlViewCount, String]#Context, collector: Collector[String]): Unit = {

      urlState.add(i)

      // Register a timer at windowEnd + 10 s, so it fires once the watermark (which lags events by up to 10 s) has passed the end of the window
      context.timerService().registerEventTimeTimer(i.windowEnd + 10 * 1000)
    }


    override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Long, UrlViewCount, String]#OnTimerContext, out: Collector[String]): Unit = {

      val allUrlViews: ListBuffer[UrlViewCount] = ListBuffer()
      import scala.collection.JavaConversions._
      for( urlView <- urlState.get() ){
        allUrlViews += urlView
      }

      urlState.clear()

      val sortedUrlViews = allUrlViews.sortBy(_.count)(Ordering.Long.reverse).take(topSize)

      var result: StringBuilder = new StringBuilder
      result.append("====================================\n")
      result.append("时间: ").append(new Timestamp(timestamp - 10 * 1000)).append("\n")

      for (i <- sortedUrlViews.indices) {
        val currentUrlView: UrlViewCount = sortedUrlViews(i)

        result.append("No").append(i+1).append(":")
          .append("  URL=").append(currentUrlView.url)
          .append("  流量=").append(currentUrlView.count).append("\n")
      }
      result.append("====================================\n\n")

      Thread.sleep(1000)
      out.collect(result.toString())

    }
  }
}
```
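The `BoundedOutOfOrdernessTimestampExtractor(Time.seconds(10))` used above is a periodic watermark assigner whose watermark trails the largest timestamp seen so far by the given bound. Below is a rough hand-written equivalent, shown only as an illustration of what the extractor does, not as part of the original job:

```scala
import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks
import org.apache.flink.streaming.api.watermark.Watermark

class LogTimestampAssigner extends AssignerWithPeriodicWatermarks[ApacheLogEvent] {
  private val maxOutOfOrderness = 10 * 1000L               // tolerate 10 s of out-of-orderness
  private var currentMaxTimestamp = Long.MinValue + maxOutOfOrderness

  // Called periodically by Flink; the watermark lags the largest timestamp by the bound.
  override def getCurrentWatermark: Watermark =
    new Watermark(currentMaxTimestamp - maxOutOfOrderness)

  // Extract the event-time timestamp and remember the largest one seen so far.
  override def extractTimestamp(element: ApacheLogEvent, previousElementTimestamp: Long): Long = {
    currentMaxTimestamp = math.max(currentMaxTimestamp, element.eventTime)
    element.eventTime
  }
}
```

It would be used in place of the built-in extractor as `.assignTimestampsAndWatermarks(new LogTimestampAssigner)`.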

8. Order timeout detection

## OrderTimeout.scala

```scala
package com.gupao.OrderTimeoutDetect

import org.apache.flink.cep.scala.CEP
import org.apache.flink.cep.scala.pattern.Pattern
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time


// Case class for the order event stream
case class OrderEvent( orderId: Long, eventType: String, eventTime: Long )
// Case class for the output result
case class OrderResult( orderId: Long, eventType: String )

object OrderTimeout {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)
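    // In this test data, order 1 is paid at 1558436842, i.e. 6000 seconds after its creation
    // at 1558430842, so it falls outside the 15-minute window and should show up on the
    // timeout side output; order 2 is paid 1 second after creation and matches the pattern.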

    val orderEventStream = env.fromCollection(List(
      OrderEvent(1, "create", 1558430842),
      OrderEvent(2, "create", 1558430843),
      OrderEvent(1, "pay", 1558436842),
      OrderEvent(2, "pay", 1558430844)
    ))
      .assignAscendingTimestamps(_.eventTime * 1000)
      .keyBy(_.orderId)


    // followedBy: relaxed contiguity (other events may occur in between)
    // within: the payment must occur within 15 minutes of the order creation
    val orderPayPattern = Pattern.begin[OrderEvent]("begin").where(_.eventType == "create")
      .followedBy("follow").where(_.eventType == "pay")
      .within(Time.minutes(15))


    // Define an output tag that identifies the side output for timed-out orders
    val orderTimoutOutput = OutputTag[OrderResult]("orderTimeout")

    // Apply the pattern to the keyed order stream
    val patternStream = CEP.pattern( orderEventStream, orderPayPattern )

    import scala.collection.Map

    val completedResultDataStream = patternStream.select(orderTimoutOutput)(
      // The orderTimoutOutput branch receives the timed-out (unmatched) orders via the side output
      ( pattern: Map[String, Iterable[OrderEvent]], timestamp: Long ) => {
        val timeoutOrderId = pattern.getOrElse("begin", null).iterator.next().orderId
        println(timestamp)
        OrderResult( timeoutOrderId, "timeout" )
      }
      // Main output: orders that matched the pattern (paid in time)
    )(

      ( pattern: Map[String, Iterable[OrderEvent]] ) => {
        val payedOrderId = pattern.getOrElse("follow", null).iterator.next().orderId
        OrderResult( payedOrderId, "success" )
      }
    )
    // Print the matched output, i.e. the orders that did not time out
    completedResultDataStream.print()


    // Print the unmatched output, i.e. the timed-out orders from the side output
    val timeoutResultDataStream = completedResultDataStream.getSideOutput(orderTimoutOutput)
    timeoutResultDataStream.print()

    env.execute("Order Timeout Detect Job")
  }
}
```
## SqlDemo.scala

```scala
package com.gupao.OrderTimeoutDetect

import org.apache.flink.api.scala.ExecutionEnvironment
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.{Table, TableEnvironment}
import org.apache.flink.streaming.api.scala._
import org.apache.flink.table.api.scala._


object SqlDemo {

  def main(args: Array[String]): Unit = {
    // Get the batch execution environment
    val env = ExecutionEnvironment.getExecutionEnvironment


    val dataSet = env.fromElements((1,"小明",15,"男",1500),(2,"小王",45,"男",4000),(3,"小李",25,"女",800),(4,"小慧",35,"女",500))
    val dataSetGrade = env.fromElements((1,"语文",100),(2,"数学",80),(1,"外语",50) )

    // Get the Table environment
    val tableEnv = TableEnvironment.getTableEnvironment(env)
    // Register the tables "user1" and "grade" (a user table and a grade table).
    // The user table's fields (third argument onward) are 'id, 'name, 'age, 'sex, 'salary,
    // i.e. user id, name, age, gender and salary; the grade table's fields are
    // 'userId, 'name, 'fraction, i.e. user id, subject name and score.

    tableEnv.registerDataSet("user1",dataSet,'id,'name,'age,'sex,'salary)
    tableEnv.registerDataSet("grade",dataSetGrade,'userId,'name,'fraction)
    // To demonstrate Flink SQL we simply use some in-memory test data here.
    // In production, data is more often read from files, databases or messaging systems,
    // Kafka being the most common case. You would then register the source table with
    // registerTableSource and the target table with registerTableSink, and run SQL with
    // e.g. sqlUpdate, along the lines of INSERT INTO target SELECT ... FROM source WHERE ...,
    // and afterwards run any further SQL against the target table, much like the queries below.


    // Simple query
    tableEnv.sqlQuery(s"select name,age FROM user1")
      .first(100).print()

    // Alias the table
    tableEnv.sqlQuery(s"select t1.name,t1.age FROM user1 as t1")
      .first(100).print()

    // Alias the columns
    tableEnv.sqlQuery(s"select name a,age as b FROM user1 ")
      .first(100).print()

    // Sort by age descending and take the top 3
    tableEnv.sqlQuery(s"select name,age FROM user1  ORDER BY age desc LIMIT 3  ")
      .first(100).print()

    // WHERE condition: only rows where sex is '女' (female)
    tableEnv.sqlQuery(s"select name,age,sex FROM user1 where sex = '女'")
      .first(100).print()

    // Query an age range
    tableEnv.sqlQuery(s"select name,age,sex FROM user1 where age between 20 and  35")
      .first(100).print()

    // Average age; other aggregates such as max, min, sum and count work the same way
    tableEnv.sqlQuery(s"select avg(age) FROM user1")
      .first(100).print()

    // Total salary per gender (sum of salary grouped by sex)
    tableEnv.sqlQuery(s"select sex,sum(salary) FROM user1 group by sex")
      .first(100).print()

    // Grouped filter: genders whose total salary exceeds 1500
    tableEnv.sqlQuery(s"select sex,sum(salary) FROM user1 group by sex having sum(salary) >1500")
      .first(100).print()

    // Deduplicate (distinct genders)
    tableEnv.sqlQuery("select distinct sex  FROM user1   ")
      .first(100).print()

    // Inner join: all columns of user1 plus the subject name and score from grade
    // tableEnv.sqlQuery("select * FROM user1  INNER JOIN  grade on  `user`.id = grade.userId ")
    tableEnv.sqlQuery("select u.*,g.name,g.fraction FROM user1 u  INNER JOIN  grade g on  u.id = g.userId ")
      .first(100).print()
  }
}
```
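The comment block in `SqlDemo` describes the production-style flow of registering a source table and a sink table and then running `INSERT INTO ... SELECT ...` via `sqlUpdate`. Below is a minimal batch sketch of that flow under the same old (pre-Blink) Table API used above; the sink path, table names and field types are illustrative assumptions, and in production the source would typically be registered from Kafka via a connector instead of an in-memory DataSet.

```scala
import org.apache.flink.api.common.typeinfo.{BasicTypeInfo, TypeInformation}
import org.apache.flink.api.scala._
import org.apache.flink.table.api.TableEnvironment
import org.apache.flink.table.api.scala._
import org.apache.flink.table.sinks.CsvTableSink

object SqlSinkDemo {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    val tableEnv = TableEnvironment.getTableEnvironment(env)

    // Register the source table (in-memory test data, as in SqlDemo above).
    val users = env.fromElements((1, "小明", 15), (2, "小王", 45), (3, "小李", 25))
    tableEnv.registerDataSet("user1", users, 'id, 'name, 'age)

    // Register the target table, backed here by a CSV sink (path is illustrative).
    tableEnv.registerTableSink(
      "user_out",
      Array("name", "age"),
      Array[TypeInformation[_]](BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO),
      new CsvTableSink("D:\\tmp\\user_out.csv", ","))

    // INSERT INTO target SELECT ... FROM source WHERE ...
    tableEnv.sqlUpdate("INSERT INTO user_out SELECT name, age FROM user1 WHERE age > 20")

    env.execute("Sql Sink Demo")
  }
}
```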