|
@@ -2,6 +2,7 @@ package com.winhc.bigdata.spark.utils
|
|
|
|
|
|
import java.util.Date
|
|
|
|
|
|
+import com.winhc.bigdata.spark.udf.CompanyMapping
|
|
|
import org.apache.commons.lang3.StringUtils
|
|
|
import org.apache.spark.sql.SparkSession
|
|
|
|
|
@@ -17,11 +18,14 @@ case class CompanyIncrForCidsUtils(s: SparkSession,
|
|
|
mainTableName: String, //主表名(不加前辍)
|
|
|
sublistTableName: String, //子表(不加前辍)
|
|
|
dupliCols: Seq[String] // 去重列
|
|
|
- ) extends LoggingUtils {
|
|
|
+ ) extends LoggingUtils with CompanyMapping{
|
|
|
@(transient@getter) val spark: SparkSession = s
|
|
|
|
|
|
def calc(): Unit = {
|
|
|
println(s"${this.getClass.getSimpleName} calc start! " + new Date().toString)
|
|
|
+
|
|
|
+ prepareFunctions(spark)
|
|
|
+
|
|
|
val inc_ods_company = s"${project}.inc_ods_company" //每日公司基本信息增量
|
|
|
val ads_company_tb = s"${project}.ads_$mainTableName" //存量ads主表数据
|
|
|
val ads_company_tb_list = s"${project}.ads_$sublistTableName" //存量子表数据 用于读取表字段
|
|
@@ -53,8 +57,11 @@ case class CompanyIncrForCidsUtils(s: SparkSession,
|
|
|
runDs = BaseUtil.atDaysAfter(1, lastDsIncAds)
|
|
|
}
|
|
|
|
|
|
+ val cols_md5 = dupliCols.filter(!_.equals("new_cid"))
|
|
|
+
|
|
|
println(
|
|
|
s"""
|
|
|
+ |cols_md5:$cols_md5
|
|
|
|remainDs:$remainDs
|
|
|
|lastDsIncOds:$lastDsIncOds
|
|
|
|lastDsIncAds:$lastDsIncAds
|
|
@@ -82,7 +89,7 @@ case class CompanyIncrForCidsUtils(s: SparkSession,
|
|
|
|AND cid IS NOT NULL
|
|
|
|AND current_cid IS NOT NULL
|
|
|
|GROUP BY cid,current_cid
|
|
|
- |""".stripMargin).cache().createOrReplaceTempView("mapping")
|
|
|
+ |""".stripMargin).createOrReplaceTempView("mapping")
|
|
|
|
|
|
//增量打平
|
|
|
sql(
|
|
@@ -99,56 +106,60 @@ case class CompanyIncrForCidsUtils(s: SparkSession,
|
|
|
//替换cid,去重,复制老数据
|
|
|
val df1 = sql(
|
|
|
s"""
|
|
|
- |INSERT OVERWRITE TABLE $inc_ads_company_tb_list PARTITION(ds='$lastDsIncOds')
|
|
|
- |SELECT CONCAT_WS('_',new_cid,id) AS rowkey
|
|
|
- | ,"0" as flag
|
|
|
- | ,CAST(new_cid as string) AS new_cid
|
|
|
+ INSERT OVERWRITE TABLE $inc_ads_company_tb_list PARTITION(ds='$lastDsIncOds')
|
|
|
+ |SELECT CONCAT_WS('_',new_cid,md5(cleanup(CONCAT_WS('',${cols_md5.mkString(",")})))) AS rowkey
|
|
|
+ | ,flag
|
|
|
+ | ,new_cid
|
|
|
| ,${sublistTableFieldName.mkString(",")}
|
|
|
|FROM (
|
|
|
- | SELECT
|
|
|
- | *
|
|
|
- | ,ROW_NUMBER() OVER (PARTITION BY ${dupliCols.mkString(",")} ORDER BY update_time DESC ) num
|
|
|
- | FROM (
|
|
|
- | SELECT
|
|
|
- | c.*
|
|
|
- | ,coalesce(d.new_cid,c.cid) AS new_cid
|
|
|
- | FROM incr_tb c
|
|
|
- | LEFT JOIN mapping d
|
|
|
- | ON c.cid = d.cid
|
|
|
- | ) e
|
|
|
- | ) f
|
|
|
- |WHERE num =1
|
|
|
- |UNION ALL
|
|
|
- |SELECT CONCAT_WS('_',new_cid,id) AS rowkey
|
|
|
- | ,"1" as flag
|
|
|
- | ,CAST(new_cid as string) AS new_cid
|
|
|
- | ,${sublistTableFieldName.mkString(",")}
|
|
|
- |FROM (
|
|
|
- | SELECT a.new_cid
|
|
|
- | ,${columns.mkString(",")}
|
|
|
- | ,ROW_NUMBER() OVER (PARTITION BY ${dupliCols.mkString(",")} ORDER BY update_time DESC ) num
|
|
|
- | FROM mapping a
|
|
|
- | JOIN (
|
|
|
- | SELECT new_cid AS cid
|
|
|
- | ,${columns.mkString(",")}
|
|
|
- | FROM ${inc_ads_company_tb_list}
|
|
|
- | WHERE ds >= ${runDs}
|
|
|
- | UNION ALL
|
|
|
- | SELECT new_cid AS cid
|
|
|
+ | SELECT "0" AS flag
|
|
|
+ | ,CAST(new_cid AS STRING) AS new_cid
|
|
|
+ | ,${sublistTableFieldName.mkString(",")}
|
|
|
+ | ,ROW_NUMBER() OVER (PARTITION BY cleanup(CONCAT_WS('',${dupliCols.mkString(",")})) ORDER BY update_time DESC ) num
|
|
|
+ | FROM (
|
|
|
+ | SELECT *
|
|
|
+ | FROM (
|
|
|
+ | SELECT c.*
|
|
|
+ | ,coalesce(d.new_cid,c.cid) AS new_cid
|
|
|
+ | FROM incr_tb c
|
|
|
+ | LEFT JOIN mapping d
|
|
|
+ | ON c.cid = d.cid
|
|
|
+ | ) e
|
|
|
+ | ) f
|
|
|
+ | UNION ALL
|
|
|
+ | SELECT "1" AS flag
|
|
|
+ | ,CAST(new_cid AS STRING) AS new_cid
|
|
|
+ | ,${sublistTableFieldName.mkString(",")}
|
|
|
+ | ,ROW_NUMBER() OVER (PARTITION BY cleanup(CONCAT_WS('',${dupliCols.mkString(",")})) ORDER BY update_time DESC ) num
|
|
|
+ | FROM (
|
|
|
+ | SELECT a.new_cid
|
|
|
| ,${columns.mkString(",")}
|
|
|
- | FROM ${ads_company_tb_list}
|
|
|
- | WHERE ds >= ${remainDs}
|
|
|
- | ) b
|
|
|
- | ON a.cid = b.cid
|
|
|
- | ) c
|
|
|
+ | FROM mapping a
|
|
|
+ | JOIN (
|
|
|
+ | SELECT new_cid AS cid
|
|
|
+ | ,${columns.mkString(",")}
|
|
|
+ | FROM ${inc_ads_company_tb_list}
|
|
|
+ | WHERE ds >= ${runDs}
|
|
|
+ | UNION ALL
|
|
|
+ | SELECT new_cid AS cid
|
|
|
+ | ,${columns.mkString(",")}
|
|
|
+ | FROM ${ads_company_tb_list}
|
|
|
+ | WHERE ds >= ${remainDs}
|
|
|
+ | ) b
|
|
|
+ | ON a.cid = b.cid
|
|
|
+ | ) c
|
|
|
+ | ) e
|
|
|
|WHERE num = 1
|
|
|
+ |AND cleanup(CONCAT_WS('',${cols_md5.mkString(",")})) IS NOT NULL
|
|
|
+ |AND trim(cleanup(CONCAT_WS('',${cols_md5.mkString(",")}))) <> ''
|
|
|
|""".stripMargin)
|
|
|
|
|
|
//主表按照id去重落库
|
|
|
sql(
|
|
|
s"""
|
|
|
|INSERT OVERWRITE TABLE $inc_ads_company_tb PARTITION(ds='$lastDsIncOds')
|
|
|
- |SELECT cids,${columns.mkString(",")}
|
|
|
+ |SELECT md5(cleanup(CONCAT_WS('',${cols_md5.mkString(",")}))) AS rowkey,
|
|
|
+ | cids,${columns.mkString(",")}
|
|
|
|FROM (
|
|
|
| SELECT cids,${columns.mkString(",")}
|
|
|
| ,ROW_NUMBER() OVER (PARTITION BY id ORDER BY update_time DESC ) num
|
|
@@ -168,7 +179,7 @@ case class CompanyIncrForCidsUtils(s: SparkSession,
|
|
|
inc_ads_company_tb_list,
|
|
|
sublistTableName,
|
|
|
lastDsIncOds,
|
|
|
- Seq("new_cid","id")
|
|
|
+ s"CONCAT_WS('_',new_cid,md5(cleanup(CONCAT_WS('',${cols_md5.mkString(",")}))))"
|
|
|
).syn()
|
|
|
|
|
|
//同步增量主表数据
|
|
@@ -179,7 +190,7 @@ case class CompanyIncrForCidsUtils(s: SparkSession,
|
|
|
inc_ads_company_tb,
|
|
|
mainTableName,
|
|
|
lastDsIncOds,
|
|
|
- Seq("id")
|
|
|
+ s"md5(cleanup(CONCAT_WS('',${cols_md5.mkString(",")})))"
|
|
|
).syn()
|
|
|
|
|
|
println(s"${this.getClass.getSimpleName} calc end! " + new Date().toString)
|