|
@@ -0,0 +1,123 @@
|
|
|
+package com.winhc.bigdata.spark.utils
|
|
|
+
|
|
|
+import java.util.Date
|
|
|
+
|
|
|
+import org.apache.spark.sql.SparkSession
|
|
|
+
|
|
|
+import scala.annotation.meta.getter
|
|
|
+
|
|
|
/**
 * Incremental `cids` conversion.
 *
 * Takes incremental dimension rows whose `cids` column holds a ';'-separated list of
 * company ids, flattens them to one row per company id, rewrites each id to its current
 * value using the `cid -> current_cid` pairs found in the company increment table, and
 * writes two outputs into the partition `ds = firstDs`:
 *
 *  - the list table (`target_inc_ads_company_tb_list`): one row per (company id, record),
 *    keyed by `rowkey = new_cid + '_' + id`. Rows coming from the increment carry
 *    `flag = "0"`; rows re-emitted from already stored data because their company id
 *    appears in the mapping carry `flag = "1"`.
 *  - the main table (`target_inc_ads_company_tb`): the latest row per `id`.
 *
 * @param s                              Spark session used for every query
 * @param inc_ods_company                daily incremental company base-info table
 * @param ads_company_tb                 stock (already accumulated) dimension table
 * @param inc_ods_company_tb             incremental dimension ODS table
 * @param target_inc_ads_company_tb      output: dimension main table
 * @param target_inc_ads_company_tb_list output: dimension list table
 * @param cols                           columns used as the de-duplication key for the list table
 */
case class CompanyIncrForCidsUtils(s: SparkSession,
                                   inc_ods_company: String, // daily incremental company base info
                                   ads_company_tb: String, // stock dimension data
                                   inc_ods_company_tb: String, // incremental dimension ODS data
                                   target_inc_ads_company_tb: String, // dimension main-table output
                                   target_inc_ads_company_tb_list: String, // dimension list-table output
                                   cols: Seq[String]) extends LoggingUtils {
  @(transient@getter) val spark: SparkSession = s

  /**
   * Runs the whole conversion: builds the `mapping` and `incr_tb` temp views, then
   * overwrites the `ds = firstDs` partition of the list table and of the main table.
   */
  def calc(): Unit = {
    println(s"${this.getClass.getSimpleName} calc start! " + new Date().toString)

    // Partition boundary used by every `ds >=` filter below. It is looked up on the same
    // table the mapping query reads, so the boundary and the filtered data cannot drift
    // apart when the caller passes a different increment table.
    // NOTE(review): presumably the earliest `ds` partition - confirm against BaseUtil.
    val firstDs = BaseUtil.getFirstPartion(inc_ods_company, spark)

    // Business columns of the table: everything except partition / key / id-list columns,
    // which are regenerated by the queries below.
    val generatedColumns = Set("ds", "cid", "new_cid", "rowkey", "cids")
    val columns: Seq[String] =
      spark.table(ads_company_tb).schema.map(_.name).filterNot(generatedColumns.contains)

    val columnList = columns.mkString(",")
    val dedupKeys = cols.mkString(",")

    // Mapping relation: distinct (cid, current_cid) pairs. Cached because the list-table
    // query below reads it twice.
    val mapping = sql(
      s"""
         |SELECT  cid,current_cid as new_cid
         |FROM    ${inc_ods_company}
         |WHERE   ds >= ${firstDs}
         |AND     cid IS NOT NULL
         |AND     current_cid IS NOT NULL
         |GROUP BY cid,current_cid
         |""".stripMargin).cache()
    mapping.createOrReplaceTempView("mapping")

    try {
      // Flatten the increment: one row per element of the ';'-separated `cids` list.
      sql(
        s"""
           |SELECT  *
           |FROM    ${inc_ods_company_tb} a
           |LATERAL VIEW explode(split(cids,';')) b AS cid
           |WHERE   ds >= ${firstDs}
           |AND     cids IS NOT NULL
           |AND     trim(cids) <> ''
           |""".stripMargin).createOrReplaceTempView("incr_tb")

      // Replace cid, de-duplicate, and copy old data.
      // Branch flag "0": increment rows with cid replaced through the mapping (falling back
      // to the original cid when there is no mapping entry).
      // Branch flag "1": previously stored rows (list table + stock table) whose company id
      // appears in the mapping, re-keyed to the new id.
      sql(
        s"""
           |INSERT OVERWRITE TABLE $target_inc_ads_company_tb_list PARTITION(ds='$firstDs')
           |SELECT  CONCAT_WS('_',new_cid,id) AS rowkey
           |        ,"0" as flag
           |        ,CAST(new_cid as string) AS new_cid
           |        ,${columnList}
           |FROM    (
           |            SELECT
           |                    *
           |                    ,ROW_NUMBER() OVER (PARTITION BY ${dedupKeys} ORDER BY update_time DESC ) num
           |            FROM    (
           |                        SELECT
           |                                c.*
           |                                ,coalesce(d.new_cid,c.cid) AS new_cid
           |                        FROM    incr_tb c
           |                        LEFT JOIN mapping d
           |                        ON      c.cid = d.cid
           |                    ) e
           |        ) f
           |WHERE   num =1
           |UNION ALL
           |SELECT  CONCAT_WS('_',new_cid,id) AS rowkey
           |        ,"1" as flag
           |        ,CAST(new_cid as string) AS new_cid
           |        ,${columnList}
           |FROM    (
           |            SELECT  a.new_cid
           |                    ,${columnList}
           |                    ,ROW_NUMBER() OVER (PARTITION BY ${dedupKeys} ORDER BY update_time DESC ) num
           |            FROM    mapping a
           |            JOIN    (
           |                        SELECT  new_cid AS cid
           |                                ,${columnList}
           |                        FROM    ${target_inc_ads_company_tb_list}
           |                        WHERE   ds >= ${firstDs}
           |                        UNION ALL
           |                        SELECT  new_cid AS cid
           |                                ,${columnList}
           |                        FROM    ${ads_company_tb}
           |                        WHERE   ds >= ${firstDs}
           |                    ) b
           |            ON      a.cid = b.cid
           |        ) c
           |WHERE   num = 1
           |""".stripMargin)

      // Main table: keep the latest row per id and persist it.
      sql(
        s"""
           |INSERT OVERWRITE TABLE $target_inc_ads_company_tb PARTITION(ds='$firstDs')
           |SELECT  ${columnList}
           |FROM    (
           |            SELECT  ${columnList}
           |                    ,ROW_NUMBER() OVER (PARTITION BY id ORDER BY update_time DESC ) num
           |            FROM    ${inc_ods_company_tb}
           |            WHERE   ds >= ${firstDs}
           |            AND     cids IS NOT NULL
           |            AND     trim(cids) <> ''
           |        ) a
           |WHERE   num = 1
           |""".stripMargin)
    } finally {
      // Release the cached mapping whether or not the writes succeeded; the temp view
      // itself stays registered and is simply recomputed if queried again.
      mapping.unpersist()
    }

    println(s"${this.getClass.getSimpleName} calc end! " + new Date().toString)
  }
}
|