浏览代码

增加字段

xufei 2 年之前
父节点
当前提交
d1f2d68693

+ 471 - 0
src/main/scala/com/winhc/bigdata/spark/ng/judicial/JudicialCaseRelationAggsV4.scala

@@ -0,0 +1,471 @@
+package com.winhc.bigdata.spark.ng.judicial
+
+import com.winhc.bigdata.spark.udf._
+import com.winhc.bigdata.spark.utils.BaseUtil.isWindows
+import com.winhc.bigdata.spark.utils.{AsyncExtract, BaseUtil, LoggingUtils, SparkUtils}
+import org.apache.commons.lang3.StringUtils
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.mutable
+
+/**
+ * @Description:司法案件新版本聚合(迭代)
+ * @author π
+ * @date 2022/7/6 16:46
+ */
+
/**
 * Job arguments for one source table feeding the judicial-case aggregation.
 *
 * @param tableName source ADS table name (e.g. "wenshu_detail_v2")
 * @param rowkey    column used to dedupe rows (PARTITION BY key in pre_calc)
 * @param cols_map  target-column -> SQL-expression map applied when loading the table
 */
case class args_case_v4(tableName: String = ""
                        , rowkey: String = "rowkey"
                        , cols_map: Map[String, String] = Map.empty
                       )
+
object args_case_v4 {

  /** Source table name -> numeric "flag" code used to tag rows downstream. */
  val tn_mapping = Map[String, String](
    "wenshu_detail_v2" -> "0"
    , "company_court_open_announcement" -> "1"
    , "company_court_announcement" -> "2"
    , "company_dishonest_info" -> "3"
    , "company_send_announcement" -> "4"
    , "company_zxr_restrict" -> "5"
    , "company_zxr_final_case" -> "6"
    , "company_zxr" -> "7"
    , "company_court_register" -> "8"
  )

  /**
   * Per-table job arguments: for each source table, the SQL expressions that
   * map its columns onto the unified pre-aggregation schema.
   * (Expressions are Spark SQL; amounts are in units of 10,000 yuan.)
   */
  val tab_args = Seq(
    // Judgement documents (amounts in 10k yuan)
    args_case_v4(tableName = "wenshu_detail_v2"
      , cols_map = Map[String, String]("flag" -> "0", "case_stage" -> "case_stage(case_no)"
        , "yg_name" -> "plaintiff_info", "bg_name" -> "defendant_info", "date" -> "judge_date"
        , "detail_id" -> "rowkey", "case_amt" -> "case_amt", "judge_amt" -> "judge_amt", "exec_amt" -> "null"
        , "data" -> "map('date',judge_date,'party_title',party_title,'case_end',case_end(judge_result),'case_reason_level4',case_reason_level4,'court_name',court_name,'court_province_code',court_province_code,'judge_date',judge_date,'judge_year',judge_year,'is_success',is_success,'flag','0' ,'plaintiff_info', plaintiff_info, 'defendant_info', defendant_info )"
        , "all_name" -> "litigant_info"
        , "detail_info" -> "to_json(named_struct('flag', '0', 'date', cast(judge_date as string), 'detail_id', rowkey, 'doc_type', doc_type, 'judge_result', judge_result))"
      ))
    // Court hearing announcements
    , args_case_v4(tableName = "company_court_open_announcement"
      , cols_map = Map[String, String]("flag" -> "1", "title" -> "null", "case_type" -> "case_type(case_no)"
        , "case_stage" -> "case_stage(case_no)", "court_name" -> "court", "case_reason" -> "case_reason"
        , "yg_name" -> "plaintiff_info", "bg_name" -> "defendant_info", "date" -> "start_date"
        , "detail_id" -> "rowkey", "case_amt" -> "null", "judge_amt" -> "null", "exec_amt" -> "null"
        , "data" -> "map('date',start_date)"
        , "all_name" -> "litigant_info"
        , "detail_info" -> "to_json(named_struct('flag', '1', 'date', cast(start_date as string), 'detail_id', rowkey, 'court', court, 'court_room',court_room))"
      ))
    // Court announcements
    , args_case_v4(tableName = "company_court_announcement"
      , cols_map = Map[String, String]("flag" -> "2", "title" -> "null", "case_type" -> "case_type(case_no)"
        , "case_stage" -> "case_stage(case_no)", "court_name" -> "court_name", "case_reason" -> "null"
        , "yg_name" -> "plaintiff_info", "bg_name" -> "litigant_info", "date" -> "concat_ws(' ',publish_date,'00:00:00')"
        , "detail_id" -> "rowkey", "case_amt" -> "null", "judge_amt" -> "null", "exec_amt" -> "null"
        , "data" -> "map('date',concat_ws(' ',publish_date,'00:00:00'))"
        , "all_name" -> "null"
        , "detail_info" -> "to_json(named_struct('flag', '2', 'date',concat_ws(' ',publish_date,'00:00:00'), 'detail_id', rowkey, 'announcement_type', announcement_type, 'court_name', court_name))"
      ))
    // Dishonest persons (defaulters)
    , args_case_v4(tableName = "company_dishonest_info"
      , cols_map = Map[String, String]("flag" -> "3", "title" -> "null", "case_type" -> "case_type(case_no)"
        , "case_stage" -> "case_stage(case_no)", "court_name" -> "court", "case_reason" -> "null"
        , "yg_name" -> "null", "bg_name" -> " to_json(array(named_struct('litigant_id',COALESCE(keyno,''),'name',name)))", "date" -> "pub_date"
        , "detail_id" -> "rowkey", "case_amt" -> "null", "judge_amt" -> "null", "exec_amt" -> "null"
        , "data" -> "map('date',reg_time,'case_end',if(deleted = 1,'1',null) )"
        , "all_name" -> "null"
        , "detail_info" -> "to_json(named_struct('flag', '3', 'date', cast(pub_date as string), 'detail_id', rowkey, 'name', array(named_struct('litigant_id',COALESCE(keyno,''),'name',name)), 'performance',  performance, 'action_content', action_content ))"
      ))
    // Service (delivery) announcements
    , args_case_v4(tableName = "company_send_announcement"
      , cols_map = Map[String, String]("flag" -> "4", "title" -> "null", "case_type" -> "case_type(case_no)"
        , "case_stage" -> "case_stage(case_no)", "court_name" -> "court", "case_reason" -> "case_reason"
        , "yg_name" -> "plaintiff_info", "bg_name" -> "defendant_info", "date" -> "start_date"
        , "detail_id" -> "rowkey", "case_amt" -> "null", "judge_amt" -> "null", "exec_amt" -> "null"
        , "data" -> "map('date',start_date)"
        , "all_name" -> "litigant_info"
        , "detail_info" -> "to_json(named_struct('flag', '4', 'date', cast(start_date as string), 'detail_id', rowkey, 'defendant_info', json_array(defendant_info), 'plaintiff_info', json_array(plaintiff_info)))"
      ))
    // Consumption restriction // TODO: prefer the company when present, else the person
    , args_case_v4(tableName = "company_zxr_restrict"
      , cols_map = Map[String, String]("flag" -> "5", "title" -> "null", "case_type" -> "case_type(case_no)"
        , "case_stage" -> "case_stage(case_no)", "court_name" -> "court_name", "case_reason" -> "null", "yg_name" -> "null"
        , "bg_name" -> "to_json(array(named_struct('litigant_id',if(length(company_name)  = 0 or company_name is NULL ,pid,company_id) ,'name', if(length(company_name)  = 0 or company_name is NULL ,person_name,company_name) )  ))"
        , "date" -> "case_create_time", "detail_id" -> "rowkey", "case_amt" -> "null", "judge_amt" -> "null", "exec_amt" -> "null"
        , "data" -> "map('date',case_create_time,'case_end',if(deleted = 1,'1',null) )"
        , "all_name" -> "null"
        , "detail_info" -> "to_json(named_struct('flag', '5', 'date', cast(case_create_time as string), 'detail_id', rowkey, 'person', array(named_struct('litigant_id',COALESCE(pid,''),'person_name',person_name)), 'company', array(named_struct('litigant_id', company_id, 'company_name',company_name)),'applicant_info', json_array(applicant_info) ))"
      ))
    // Enforcement terminated ("final case")
    , args_case_v4(tableName = "company_zxr_final_case"
      , cols_map = Map[String, String]("flag" -> "6", "title" -> "null", "case_type" -> "case_type(case_no)"
        , "case_stage" -> "case_stage(case_no)", "court_name" -> "court_name", "case_reason" -> "null"
        , "yg_name" -> "null", "bg_name" -> "to_json(array(named_struct('litigant_id',COALESCE(keyno,''),'name',name)))"
        , "date" -> "case_create_time"
        , "detail_id" -> "rowkey", "case_amt" -> "null", "judge_amt" -> "null", "exec_amt" -> "null"
        , "data" -> "map('date',case_create_time,'case_end',if(deleted = 0,'1',null) )"
        , "all_name" -> "null"
        , "detail_info" -> "to_json(named_struct('flag', '6', 'date', cast(case_create_time as string), 'detail_id', rowkey, 'name', array(named_struct('litigant_id',COALESCE(keyno,''), 'name',name)), 'exec_amount', amt_div(exec_amount, 10000), 'no_exec_amount', amt_div(no_exec_amount, 10000) ))"
      ))
    // Persons subject to enforcement
    , args_case_v4(tableName = "company_zxr"
      , cols_map = Map[String, String]("flag" -> "7", "title" -> "null", "case_type" -> "case_type(case_no)"
        , "case_stage" -> "case_stage(case_no)", "court_name" -> "court", "case_reason" -> "null"
        , "yg_name" -> "null", "bg_name" -> "to_json(array(named_struct('litigant_id',COALESCE(keyno,''),'name',name)))", "date" -> "case_create_time"
        , "detail_id" -> "rowkey", "case_amt" -> "null", "judge_amt" -> "null", "exec_amt" -> "amt_div(exec_money,10000)"
        , "data" -> "map('date', case_create_time, 'exec_info', to_json(array(named_struct('litigant_id',COALESCE(keyno,''),'name',name,'exec_money',amt_div(exec_money,10000),'date',case_create_time ,'flag','7' ))) )"
        , "all_name" -> "null"
        , "detail_info" -> "to_json(named_struct('flag', '7', 'date', cast(case_create_time as string), 'detail_id', rowkey, 'name', array(named_struct('litigant_id',COALESCE(keyno,''),'name',name)), 'exec_money', amt_div(exec_money,10000) ))"
      ))
    // Case filing (registration) info
    , args_case_v4(tableName = "company_court_register"
      , cols_map = Map[String, String]("flag" -> "8", "title" -> "null", "case_type" -> "case_type(case_no)"
        , "case_stage" -> "case_stage(case_no)", "court_name" -> "court", "case_reason" -> "case_reason"
        , "yg_name" -> "plaintiff_info", "bg_name" -> "defendant_info", "date" -> "filing_date"
        , "detail_id" -> "rowkey", "case_amt" -> "null", "judge_amt" -> "null", "exec_amt" -> "null"
        , "data" -> "map('date',filing_date)"
        , "all_name" -> "litigant_info"
        , "detail_info" -> "to_json(named_struct('flag', '8', 'date', cast(filing_date as string), 'detail_id', rowkey, 'court', court, 'judge', judge))"
      ))
  )

  /**
   * Looks up the job arguments for the given table name.
   *
   * @throws NullPointerException if `tn` is not a configured table
   *                              (exception type kept for caller compatibility;
   *                              message typo "fount" fixed and offending value included)
   */
  def get_job_args(tn: String): args_case_v4 = {
    tab_args.find(p => tn.equals(p.tableName))
      .getOrElse(throw new NullPointerException(s"tn is not found: $tn"))
  }

  /** Default (empty) job arguments. */
  def get_job_args(): args_case_v4 = {
    args_case_v4()
  }
}
+
+
object JudicialCaseRelationAggsV4 {

  /**
   * Entry point. Accepts either (project, tn, c) or (project, c); any other
   * arity prints usage and exits with -1. `tn` may be "all", a single table,
   * or a comma-separated list; `c` selects the phase ("pre_calc" / "calc").
   */
  def main(args: Array[String]): Unit = {
    val (project, tnArg, c) = args match {
      case Array(p1, p2, p3) => (p1, p2, p3)
      case Array(p1, p2) => (p1, "", p2)
      case _ =>
        println("please check project tn c!")
        sys.exit(-1)
    }
    var tn = tnArg
    println(
      s"""
         |project: $project
         |tn: $tn
         |c: $c
         |""".stripMargin)

    val config = mutable.Map(
      "spark.hadoop.odps.project.name" -> s"$project",
      "spark.hadoop.odps.spark.local.partition.amt" -> "10000"
    )
    val spark: SparkSession = SparkUtils.InitEnv(this.getClass.getSimpleName, config)

    // Default to the judgement-document table when tn was omitted.
    if (StringUtils.isBlank(tn)) {
      tn = "wenshu_detail_v2"
    }

    if ("all".equals(tn)) {
      // One async task per configured source table.
      val tasks = args_case_v4.tab_args.map { a =>
        (a.tableName, () => {
          run(project, a.tableName, c, spark)
          true
        })
      }
      AsyncExtract.startAndWait(spark, tasks)
    } else if (tn.contains(",")) {
      // Restrict to the requested subset, preserving configuration order.
      val wanted = tn.split(",", -1)
      val tasks = args_case_v4.tab_args.map(_.tableName).filter(wanted.contains(_)).map { t =>
        (t, () => {
          run(project, t, c, spark)
          true
        })
      }
      AsyncExtract.startAndWait(spark, tasks)
    } else {
      run(project, tn, c, spark)
    }
    spark.stop()
  }

  /** Dispatches one table's job to the requested phase; unknown phases abort. */
  private def run(project: String, tn: String, c: String, spark: SparkSession) = {
    val job = JudicialCaseRelationAggsV4(spark, project, args_case_v4.get_job_args(tn))
    c match {
      case "pre_calc" => job.pre_calc()
      case "calc" => job.calc()
      case _ =>
        println("not fun to run !")
        sys.exit(-1)
    }
  }
}
+
/**
 * New-version (v4) judicial case aggregation job.
 *
 * `pre_calc()` normalises one source table (configured via [[args_case_v4]])
 * into the shared pre-aggregation table; `calc()` builds the enhanced detail
 * table (r3) and then the main case table (r1) from the id-mapped relation
 * table.
 *
 * Fix: in the r1 query, `data['plain tiff_info']` (stray space inside the map
 * key, which silently yields NULL) is corrected to `data['plaintiff_info']`,
 * matching the r3 query above it.
 */
case class JudicialCaseRelationAggsV4(s: SparkSession, project: String, args_case_v4: args_case_v4
                                     ) extends LoggingUtils with CompanyMapping with BaseFunc with CourtRank {
  override protected val spark: SparkSession = s

  // Pre-processing table (one partition per source table).
  // NOTE: table names intentionally keep the original leading space; it is
  // harmless inside the generated SQL.
  val ads_judicial_case_relation_pre = s" $project.ads_judicial_case_relation_pre_v9_dev"
  // Id-replacement table (judicase_id resolved)
  val ads_judicial_case_relation_id = s" $project.ads_judicial_case_relation_id_v9_dev"
  // Main case table
  val ads_judicial_case_relation_r1 = s" $project.ads_judicial_case_relation_r1_dev"
  // Detail table (enhanced)
  val ads_judicial_case_relation_r3 = s" $project.ads_judicial_case_relation_r3_dev"


  val update = "update"
  val incr = "incr"
  val init_ds = "20220818"

  private val cols_map: Map[String, String] = args_case_v4.cols_map
  private val rowkey: String = args_case_v4.rowkey
  private val tableName: String = args_case_v4.tableName

  // "_v9" is concatenated separately so the interpolator does not parse "$tableName_v9".
  val ads_table = s" $project.ads_$tableName" + "_v9"
  val inc_ads_table = s" $project.inc_ads_$tableName" + "_v9"


  // All pre-table columns except the partition/tag columns.
  val pre_cols = getColumns(ads_judicial_case_relation_pre).diff(Seq("ds", "tn"))
  var last_ds = init_ds
  val calc_ds = init_ds

  //  if (calc_ds.equals(last_ds)) {
  //    last_ds = BaseUtil.getSecondPartion(ads_judicial_case_relation_pre, tableName, spark)
  //  }
  val is_incr = false

  // Rewrite each pre-table column through the per-table expression map.
  val cols = pre_cols.map(c => {
    if (cols_map.contains(c)) {
      s"${cols_map(c)} as $c"
    } else c
  })

  case_no_trim_udf_v2()
  prepareFunctions(spark)

  val sort = get_partition_order_by()

  /**
   * Loads the configured source table (full + incremental), applies the
   * column-expression mapping, dedupes by rowkey (newest partition wins) and
   * overwrites this table's partition of the pre-aggregation table.
   */
  def pre_calc(): Unit = {
    // Full-history branch, only included when not running incrementally.
    var all_sql = ""
    if (!is_incr) {
      all_sql =
        s"""
           |SELECT  *
           |FROM    $ads_table
           |WHERE   ${if (is_incr) "ds = -1" else "ds > 0"}
           |UNION ALL
           |""".stripMargin
    }

    // Source rows, deduped by rowkey with the configured partition ordering.
    sql(
      s"""
         |INSERT ${if (isWindows) "INTO" else "OVERWRITE"} TABLE $ads_judicial_case_relation_pre PARTITION(ds='$calc_ds',tn='$tableName')
         |SELECT
         |${pre_cols.mkString(",")}
         |from    (
         |            SELECT  ${cols.mkString(",")}
         |                    ,ROW_NUMBER() OVER(PARTITION BY $rowkey ORDER BY $sort) AS num
         |            from    (
         |                       $all_sql
         |                        SELECT  *
         |                        FROM    $inc_ads_table
         |                        WHERE    ${if (is_incr) s"ds > $last_ds" else "ds > 0"}
         |                    )
         |        )
         |WHERE   num = 1
         |${if (isWindows) "LIMIT 1000" else ""}
         |""".stripMargin).show(100, false)

    // Insert an empty partition if none was produced.
    addEmptyPartitionOrSkipPlus(ads_judicial_case_relation_pre, calc_ds, tableName)
  }


  /**
   * Builds the detail table (r3, grouped by judicase_id + case_no) and then
   * the main case table (r1, grouped by judicase_id) for partition `calc_ds`.
   */
  def calc(): Unit = {
    prepareFunctions(spark)
    case_no_trim_udf_v2()
    registerCourtRank()
    spark.udf.register("name_aggs", new NameAggsPlusV2(1000))
    spark.udf.register("case_reason", new CaseReasonAggs(1000))
    spark.udf.register("all_name_plus_v2", new AllNamePlusV2(1000))
    spark.udf.register("case_amt_plus_v2", new CaseAmtAggsPlusV2(1000))
    spark.udf.register("company_case_aggs", new CompanyCaseAggs(1000))


    // Detail table
    sql(
      s"""
         |INSERT ${if (isWindows) "INTO" else "OVERWRITE"} TABLE $ads_judicial_case_relation_r3 PARTITION(ds='$calc_ds')
         |SELECT
         |    id,
         |    judicase_id,
         |    title       ,
         |    case_type   ,
         |    case_reason ,
         |    case_no     ,
         |    court_name  ,
         |    case_stage  ,
         |    lable       ,
         |    detail      ,
         |    name_aggs['yg_name'] yg_name,
         |    name_aggs['bg_name'] bg_name,
         |    last_date   ,
         |    0 deleted   ,
         |    all_name    ,
         |    court_level,
         |    case_amt,
         |    judge_amt,
         |    exec_info,
         |    case_end,
         |    now() as update_time
         |    ,company_case
         |FROM
         |(
         |SELECT  md5(concat_ws('',concat_ws('',judicase_id),CLEANUP(case_no))) id
         |        ,judicase_id
         |        ,max(title) title
         |        ,case_type(max(case_no)) as case_type
         |        ,case_reason(case_reason,date,flag) case_reason
         |        ,case_no
         |        ,concat_ws(',',collect_set(court_name)) court_name
         |        ,case_stage(max(case_no)) as case_stage
         |        ,trim_black(concat_ws(',', collect_set(lable))) lable
         |        ,concat('[',concat_ws(',',collect_set(detail)),']') detail
         |        ,max(date) last_date
         |        ,name_aggs(yg_name,bg_name,flag,data['date'],detail_id) name_aggs
         |        ,all_name_plus_v2(all_name) all_name
         |        ,trim_black(concat_ws(',',collect_set(court_level))) court_level
         |        ,max(case_amt) as case_amt
         |        ,max(judge_amt) as judge_amt
         |        ,case_amt_plus_v2(data['exec_info']) as exec_info
         |        ,max(case_end) as case_end
         |        -- ,company_case_aggs(data['judge_date'],data['case_reason_level4'],data['judge_year'],data['is_success'],data['plaintiff_info'],data['defendant_info'],data['flag']) as company_case
         |        ,company_case_aggs(data['judge_date'] ,data['case_reason_level4'] ,data['judge_year'] ,data['is_success'] ,data['plaintiff_info'] ,data['defendant_info'] ,data['flag'] ) as company_case
         |FROM    (
         |        SELECT  a.*,court_level(court_name) court_level
         |        FROM    (
         |                   SELECT   judicase_id
         |                           ,flag
         |                           ,title
         |                           ,case_type(case_no) case_type
         |                           ,adjust_reason(case_reason) case_reason
         |                           ,case_no_trim(case_no) as case_no
         |                           ,court_name
         |                           ,case_stage(case_no) case_stage
         |                           ,case_label(flag) lable
         |                           ,detail
         |                           ,yg_name
         |                           ,bg_name
         |                           ,all_name
         |                           ,date
         |                           ,detail_id
         |                           ,case_amt
         |                           ,judge_amt
         |                           ,tn
         |                           ,data
         |                           ,case_end
         |                           ,coalesce(data['judge_date'],'') as judge_date
         |                           ,coalesce(data['case_reason_level4'],'') as case_reason_level4
         |                           ,coalesce(data['judge_year'],'') as judge_year
         |                           ,coalesce(data['is_success'],'') as is_success
         |                           ,coalesce(data['plaintiff_info'],'') as plaintiff_info
         |                           ,coalesce(data['defendant_info'],'') as defendant_info
         |                   FROM    $ads_judicial_case_relation_id
         |                   WHERE   ds = '$calc_ds' AND length(case_label(flag)) > 0 AND  case_no_trim(case_no) is not null AND  date is not null
         |                )a
         |)
         |GROUP BY judicase_id
         |         ,case_no
         |) x
         |""".stripMargin).show(10, false)

    // Judicial case main table
    // (fix applied below: data['plain tiff_info'] -> data['plaintiff_info'])
    sql(
      s"""
         |INSERT ${if (isWindows) "INTO" else "OVERWRITE"} TABLE $ads_judicial_case_relation_r1 PARTITION(ds='$calc_ds')
         |SELECT
         |    judicase_id,
         |    title       ,
         |    case_type   ,
         |    case_reason ,
         |    case_no     ,
         |    court_name  ,
         |    case_stage  ,
         |    lable       ,
         |    name_aggs['yg_name'] yg_name,
         |    name_aggs['bg_name'] bg_name,
         |    all_name,
         |    case_info    ,
         |    judge_info   ,
         |    exec_info    ,
         |    date         ,
         |    court_level  ,
         |    0 deleted    ,
         |    case_end     ,
         |    now() as update_time,
         |    company_case
         |FROM
         |(
         |SELECT  judicase_id
         |        ,max(title) title
         |        ,concat_ws(',',collect_set(case_type)) case_type
         |        ,case_reason(case_reason,date,'0') case_reason
         |        ,concat_ws(',',collect_set(case_no)) case_no
         |        ,trim_black(concat_ws(',',collect_set(court_name))) court_name
         |        ,max(last_stage) case_stage
         |        ,trim_black(concat_ws(',', collect_set(lable)) ) lable
         |        -- ,max(first_case_amt) case_amt
         |        ,max(date) AS date
         |        ,trim_black(concat_ws(',',collect_set(court_level))) court_level
         |        ,name_aggs(yg_name,bg_name,'0',date,'0') name_aggs
         |        ,all_name_plus_v2(all_name) all_name
         |        ,amt_merge(concat_ws('&',collect_set(case_info))) case_info
         |        ,amt_merge(concat_ws('&',collect_set(judge_info))) judge_info
         |        ,case_amt_plus_v2(exec_info) as exec_info
         |        ,max(case_end) as case_end
         |        ,company_case_aggs(data['judge_date'] ,data['case_reason_level4'] ,data['judge_year'] ,data['is_success'] ,data['plaintiff_info'] ,data['defendant_info'] ,data['flag'] ) as company_case
         |FROM    (
         |        SELECT  a.*
         |        FROM    (
         |                   SELECT  judicase_id,title,case_type,case_reason,case_no,court_name,case_stage,lable,yg_name,bg_name,all_name,date,case_amt,judge_amt,exec_info,case_end
         |                   ,court_level(court_name) court_level
         |                   ,concat_ws('|',case_stage,coalesce(case_amt,0))  as case_info
         |                   ,concat_ws('|',case_stage,coalesce(judge_amt,0)) as judge_info
         |                   ,first_value(case_stage) OVER (PARTITION BY judicase_id ORDER BY date DESC ) AS last_stage
         |                   ,company_case as data
         |                   FROM    $ads_judicial_case_relation_r3
         |                   WHERE   ds = '$calc_ds'
         |                ) a
         |        )
         |GROUP BY judicase_id
         |)x
         |""".stripMargin).show(20, false)

    // Insert empty partitions if none were produced.
    addEmptyPartitionOrSkip(ads_judicial_case_relation_r1, calc_ds)
    addEmptyPartitionOrSkip(ads_judicial_case_relation_r3, calc_ds)
  }

  /** Dedupe ordering: prefer update_time when the pre-table carries one. */
  private def get_partition_order_by(): String = {
    if (pre_cols.contains("update_time") || pre_cols.contains("update_date")) {
      " ds DESC,update_time DESC "
    } else {
      " ds DESC "
    }
  }

  /**
   * Returns the last complete partition of `tabName`: the latest partition,
   * or the second-latest when the latest is today's still-open (yesterday-
   * stamped) partition.
   */
  def calc_last_ds(tabName: String, default: String = "0"): String = {
    var d1 = getLastPartitionsOrElse(tabName, default)
    val d2 = BaseUtil.getYesterday()
    if (d1.equals(d2)) {
      d1 = getSecondLastPartitionOrElse(tabName, default)
    }
    d1
  }
}
+

+ 124 - 0
src/main/scala/com/winhc/bigdata/spark/udf/CompanyCaseAggs.scala

@@ -0,0 +1,124 @@
+package com.winhc.bigdata.spark.udf
+
+import com.winhc.bigdata.spark.utils.BaseUtil.{json_array, list_json}
+import org.apache.commons.lang3.StringUtils
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
+import org.apache.spark.sql.types._
+
+/**
+ * @Description:company_case聚合
+ * @author π
+ * @date 2020/10/26 15:15
+ */
+
/**
 * Spark UDAF that aggregates per-case "company_case" attributes.
 *
 * Buffer: a single map keyed by judge_date ("" for flag-7 rows) whose value is
 * the \u0001-joined payload (case_reason_level4, judge_year, is_success,
 * plaintiff_info, defendant_info, flag). `evaluate` picks the earliest dated
 * entry for reason/year, the latest for is_success (overridden when an
 * enforcement row exists), and unions all litigant JSON arrays.
 *
 * Fix: the `max` entry cap previously compared `buffer.size` — the Row arity,
 * which is always 1 — so it never triggered; it now checks the map's size.
 */
class CompanyCaseAggs(max: Int) extends UserDefinedAggregateFunction {

  // Only judgement-document rows ("0") and enforcement rows ("7") contribute.
  val flags = Seq("0", "7")
  // Field separator inside the buffered payload string.
  val split = "\u0001"

  override def inputSchema: StructType = StructType(Array[StructField](
    StructField("judge_date", DataTypes.StringType)
    , StructField("case_reason_level4", DataTypes.StringType)
    , StructField("judge_year", DataTypes.StringType)
    , StructField("is_success", DataTypes.StringType)
    , StructField("plaintiff_info", DataTypes.StringType)
    , StructField("defendant_info", DataTypes.StringType)
    , StructField("flag", DataTypes.StringType)
  ))

  override def bufferSchema: StructType = StructType(
    Array[StructField](
      StructField("t1", DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType))
    )
  )

  override def dataType: DataType = DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType)

  override def deterministic: Boolean = true

  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer.update(0, Map[String, String]())
  }

  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    val current = buffer.getMap[String, String](0).toMap
    // Cap the number of buffered entries (fixed: was `buffer.size`, the Row
    // arity, which is always 1, so the cap never applied).
    if (current.size >= max) {
      return
    }
    if (input.size != 7) return
    val judge_date = input.getAs[String](0)
    val case_reason_level4 = input.getAs[String](1)
    val judge_year = input.getAs[String](2)
    val is_success = input.getAs[String](3)
    val plaintiff_info = input.getAs[String](4)
    val defendant_info = input.getAs[String](5)
    val flag = input.getAs[String](6)

    if (!flags.contains(flag)) {
      return
    }

    val payload = s"$case_reason_level4$split$judge_year$split$is_success$split$plaintiff_info$split$defendant_info$split$flag"
    if (flag.equals("0")) {
      // Dated entry, keyed by judge_date; blank dates are skipped.
      if (StringUtils.isNotBlank(judge_date)) {
        buffer.update(0, current + (judge_date -> payload))
      }
    } else if (flag.equals("7")) {
      // Enforcement marker entry, keyed by the empty string.
      buffer.update(0, current + ("" -> payload))
    }

  }

  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    // Same cap fix as in update(): check the map's size, not the Row arity.
    if (buffer1.getMap[String, String](0).size >= max) {
      return
    }
    buffer1(0) = buffer1.getAs[Map[String, String]](0) ++ buffer2.getAs[Map[String, String]](0)
  }

  override def evaluate(buffer: Row): Any = {
    val m0: Map[String, String] = buffer.getAs[Map[String, String]](0)

    if (m0.nonEmpty) {
      // Keys with a real judge_date; "" is the flag-7 marker, not a date.
      val datedKeys = m0.keySet.filter(x => StringUtils.isNotBlank(x))
      if (datedKeys.nonEmpty) {
        val minKey = datedKeys.min
        val maxKey = datedKeys.max

        val arr_min = m0.getOrElse(minKey, "").split(s"$split", -1)
        val arr_max = m0.getOrElse(maxKey, "").split(s"$split", -1)

        // is_success comes from the latest judgement unless an enforcement
        // ("7") entry exists, which forces the outcome.
        val hasExecEntry = m0.keySet.contains("")
        val is_success = if (hasExecEntry) "胜" else arr_max(2)
        val flag = if (hasExecEntry) "7" else "0"

        // Union of plaintiff + defendant JSON arrays across all dated entries.
        val litigants = m0.filter(x => x._1.length > 0).map(y => {
          val arr = y.split(s"$split", -1)
          json_array(arr(3)) ++ json_array(arr(4))
        }).reduce(_ ++ _)

        return Map("judge_date" -> minKey
          , "case_reason_level4" -> arr_min(0)
          , "judge_year" -> arr_min(1)
          , "is_success" -> is_success
          , "plaintiff_info" -> list_json(litigants)
          , "defendant_info" -> "[]"
          , "flag" -> flag
        )
      }
    }
    // No usable entries: empty result shell.
    Map("judge_date" -> ""
      , "case_reason_level4" -> ""
      , "judge_year" -> ""
      , "is_success" -> ""
      , "plaintiff_info" -> "[]"
      , "defendant_info" -> "[]"
      , "flag" -> ""
    )
  }
}

文件差异内容过多而无法显示
+ 26 - 0
src/main/scala/com/winhc/bigdata/spark/utils/BaseUtil.scala