|
@@ -0,0 +1,263 @@
|
|
|
+package com.winhc.bigdata.spark.jobs
|
|
|
+
|
|
|
+import com.winhc.bigdata.spark.config.EsConfig
|
|
|
+import com.winhc.bigdata.spark.udf.{BaseFunc, CompanyStaffAggs}
|
|
|
+import com.winhc.bigdata.spark.utils.{LoggingUtils, SparkUtils}
|
|
|
+import org.apache.spark.internal.Logging
|
|
|
+import org.apache.spark.sql.SparkSession
|
|
|
+
|
|
|
+import scala.annotation.meta.getter
|
|
|
+import scala.collection.mutable
|
|
|
+
|
|
|
+/**
|
|
|
+ * @Author: XuJiakai
|
|
|
+ * @Date: 2020/8/3 14:48
|
|
|
+ * @Description:
|
|
|
+ */
|
|
|
+object company_staff {
|
|
|
+
|
|
|
+
|
|
|
+ case class CompanyStaffUtil(s: SparkSession,
|
|
|
+ project: String
|
|
|
+ ) extends LoggingUtils with Logging with BaseFunc {
|
|
|
+ @(transient@getter) val spark: SparkSession = s
|
|
|
+
|
|
|
+
|
|
|
+ def init(): Unit = {
|
|
|
+ cleanup()
|
|
|
+ spark.udf.register("agg_val", new CompanyStaffAggs())
|
|
|
+ sql(
|
|
|
+ """
|
|
|
+ |CREATE TABLE IF NOT EXISTS winhc_eci_dev.ads_company_staff
|
|
|
+ |(
|
|
|
+ | rowkey STRING COMMENT 'FIELD'
|
|
|
+ | ,new_cid STRING COMMENT 'FIELD'
|
|
|
+ | ,id BIGINT COMMENT ''
|
|
|
+ | ,cid BIGINT COMMENT ''
|
|
|
+ | ,hid BIGINT COMMENT ''
|
|
|
+ | ,staff_type STRING COMMENT ''
|
|
|
+ | ,create_time DATETIME COMMENT ''
|
|
|
+ | ,update_time DATETIME COMMENT ''
|
|
|
+ | ,deleted BIGINT COMMENT ''
|
|
|
+ |)
|
|
|
+ |COMMENT 'TABLE COMMENT'
|
|
|
+ |PARTITIONED BY
|
|
|
+ |(
|
|
|
+ | ds STRING COMMENT '分区'
|
|
|
+ |)
|
|
|
+ |""".stripMargin)
|
|
|
+
|
|
|
+ sql(
|
|
|
+ s"""
|
|
|
+ |CREATE TABLE IF NOT EXISTS `winhc_eci_dev`.`inc_ads_company_staff` (
|
|
|
+ | `rowkey` STRING COMMENT 'FIELD',
|
|
|
+ | `new_cid` STRING COMMENT 'FIELD',
|
|
|
+ | `id` BIGINT,
|
|
|
+ | `cid` BIGINT,
|
|
|
+ | `hid` BIGINT,
|
|
|
+ | `staff_type` STRING,
|
|
|
+ | `create_time` DATETIME,
|
|
|
+ | `update_time` DATETIME,
|
|
|
+ | `deleted` BIGINT)
|
|
|
+ | COMMENT 'TABLE COMMENT'
|
|
|
+ |PARTITIONED BY (
|
|
|
+ | `ds` STRING COMMENT '分区')
|
|
|
+ |""".stripMargin)
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ def all(): Unit = {
|
|
|
+
|
|
|
+ sql(
|
|
|
+ s"""
|
|
|
+ |SELECT a.*
|
|
|
+ | ,coalesce(b.new_cid,a.cid) AS new_cid
|
|
|
+ |FROM winhc_eci_dev.ods_company_staff a
|
|
|
+ |LEFT JOIN winhc_eci_dev.company_map b
|
|
|
+ |ON a.cid = b.cid
|
|
|
+ |WHERE a.ds = 20200604
|
|
|
+ |AND a.cid IS NOT NULL
|
|
|
+ |""".stripMargin)
|
|
|
+ // .cache()
|
|
|
+ .createOrReplaceTempView("tmp_all")
|
|
|
+
|
|
|
+
|
|
|
+ sql(
|
|
|
+ s"""
|
|
|
+ |INSERT OVERWRITE TABLE winhc_eci_dev.ads_company_staff PARTITION(ds=20200604)
|
|
|
+ |SELECT t2.rowkey
|
|
|
+ | ,t2.new_cid
|
|
|
+ | ,t2.id
|
|
|
+ | ,t2.cid
|
|
|
+ | ,t2.hid
|
|
|
+ | ,t3.types AS staff_type
|
|
|
+ | ,t2.create_time
|
|
|
+ | ,t2.update_time
|
|
|
+ | ,t2.deleted
|
|
|
+ |FROM (
|
|
|
+ | SELECT rowkey
|
|
|
+ | ,new_cid
|
|
|
+ | ,id
|
|
|
+ | ,cid
|
|
|
+ | ,hid
|
|
|
+ | ,staff_type
|
|
|
+ | ,create_time
|
|
|
+ | ,update_time
|
|
|
+ | ,deleted
|
|
|
+ | FROM (
|
|
|
+ | SELECT *
|
|
|
+ | ,ROW_NUMBER() OVER (PARTITION BY new_cid,hid ORDER BY id DESC ) num
|
|
|
+ | ,CONCAT_WS('_',new_cid,md5(cleanup(CONCAT_WS('',hid)))) AS rowkey
|
|
|
+ | ,cleanup(CONCAT_WS('',hid)) AS cols
|
|
|
+ | FROM tmp_all AS c
|
|
|
+ | ) d
|
|
|
+ | WHERE num = 1
|
|
|
+ | AND cols IS NOT NULL
|
|
|
+ | AND trim(cols) <> ''
|
|
|
+ | ) AS t2
|
|
|
+ |LEFT JOIN (
|
|
|
+ | SELECT t1.new_cid
|
|
|
+ | ,t1.hid
|
|
|
+ | ,agg_val(t1.staff_type) AS types
|
|
|
+ | FROM tmp_all AS t1
|
|
|
+ | GROUP BY t1.new_cid
|
|
|
+ | ,t1.hid
|
|
|
+ | ) AS t3
|
|
|
+ |ON CONCAT_WS('',t2.hid,t2.new_cid) = CONCAT_WS('',t3.hid,t3.new_cid)
|
|
|
+ |""".stripMargin)
|
|
|
+ }
|
|
|
+
|
|
|
+ def inc(): Unit = {
|
|
|
+ val lastDs = getLastPartitionsOrElse("winhc_eci_dev.inc_ads_company_staff", "20200604")
|
|
|
+ val dss = getPartitions("winhc_eci_dev.inc_ods_company_staff").filter(_ > lastDs)
|
|
|
+
|
|
|
+ println("计算分区:" + dss.mkString(","))
|
|
|
+
|
|
|
+ for (ds <- dss) {
|
|
|
+ inc(ds)
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ def inc(ds: String): Unit = {
|
|
|
+ val lastPat = getLastPartitionsOrElse("winhc_eci_dev.ads_company_staff", "0")
|
|
|
+ sql(
|
|
|
+ s"""
|
|
|
+ |SELECT cid
|
|
|
+ | ,current_cid AS new_cid
|
|
|
+ |FROM winhc_eci_dev.inc_ods_company
|
|
|
+ |WHERE ds = '$ds'
|
|
|
+ |AND cid IS NOT NULL
|
|
|
+ |AND current_cid IS NOT NULL
|
|
|
+ |""".stripMargin)
|
|
|
+ .createOrReplaceTempView("mapping")
|
|
|
+
|
|
|
+ sql(
|
|
|
+ s"""
|
|
|
+ |SELECT concat_ws(
|
|
|
+ | '_'
|
|
|
+ | ,coalesce(t2.new_cid,t1.cid)
|
|
|
+ | ,split(rowkey, '_')[1]
|
|
|
+ | ) AS rowkey
|
|
|
+ | ,coalesce(t2.new_cid,t1.cid) as new_cid
|
|
|
+ | ,id
|
|
|
+ | ,t1.cid
|
|
|
+ | ,hid
|
|
|
+ | ,staff_type
|
|
|
+ | ,create_time
|
|
|
+ | ,update_time
|
|
|
+ | ,deleted
|
|
|
+ |FROM (
|
|
|
+ | SELECT *
|
|
|
+ | FROM winhc_eci_dev.ads_company_staff
|
|
|
+ | WHERE ds = '$lastPat'
|
|
|
+ | UNION ALL
|
|
|
+ | SELECT *
|
|
|
+ | FROM winhc_eci_dev.inc_ads_company_staff
|
|
|
+ | WHERE ds > $lastPat
|
|
|
+ | AND ds <= $ds
|
|
|
+ | ) AS t1
|
|
|
+ |JOIN mapping AS t2
|
|
|
+ |ON t1.new_cid = t2.cid
|
|
|
+ |UNION ALL
|
|
|
+ |SELECT rowkey
|
|
|
+ | ,new_cid
|
|
|
+ | ,id
|
|
|
+ | ,cid
|
|
|
+ | ,hid
|
|
|
+ | ,staff_type
|
|
|
+ | ,create_time
|
|
|
+ | ,update_time
|
|
|
+ | ,deleted
|
|
|
+ |FROM (
|
|
|
+ | SELECT *
|
|
|
+ | ,CONCAT_WS('_',new_cid,md5(cleanup(CONCAT_WS('',hid)))) AS rowkey
|
|
|
+ | ,ROW_NUMBER() OVER (PARTITION BY new_cid,hid ORDER BY id DESC ) num
|
|
|
+ | FROM (
|
|
|
+ | SELECT coalesce(ta2.new_cid,ta1.cid) AS new_cid
|
|
|
+ | ,id
|
|
|
+ | ,ta1.cid
|
|
|
+ | ,hid
|
|
|
+ | ,staff_type
|
|
|
+ | ,create_time
|
|
|
+ | ,update_time
|
|
|
+ | ,deleted
|
|
|
+ | FROM (
|
|
|
+ | SELECT *
|
|
|
+ | FROM winhc_eci_dev.inc_ods_company_staff
|
|
|
+ | WHERE ds = '$ds'
|
|
|
+ | ) AS ta1
|
|
|
+ | LEFT JOIN mapping AS ta2
|
|
|
+ | ON ta1.cid = ta2.cid
|
|
|
+ | )
|
|
|
+ | )
|
|
|
+ |WHERE num = 1
|
|
|
+ |""".stripMargin)
|
|
|
+ .cache().createOrReplaceTempView("inc_ads_tmp")
|
|
|
+
|
|
|
+ sql(
|
|
|
+ s"""
|
|
|
+ |INSERT OVERWRITE TABLE winhc_eci_dev.inc_ads_company_staff PARTITION(ds=$ds)
|
|
|
+ |SELECT t2.rowkey
|
|
|
+ | ,t2.new_cid
|
|
|
+ | ,t2.id
|
|
|
+ | ,t2.cid
|
|
|
+ | ,t2.hid
|
|
|
+ | ,t3.types AS staff_type
|
|
|
+ | ,t2.create_time
|
|
|
+ | ,t2.update_time
|
|
|
+ | ,t2.deleted
|
|
|
+ |FROM inc_ads_tmp AS t2
|
|
|
+ |LEFT JOIN (
|
|
|
+ | SELECT t1.new_cid
|
|
|
+ | ,t1.hid
|
|
|
+ | ,agg_val(t1.staff_type) AS types
|
|
|
+ | FROM inc_ads_tmp AS t1
|
|
|
+ | GROUP BY t1.new_cid
|
|
|
+ | ,t1.hid
|
|
|
+ | ) AS t3
|
|
|
+ |ON CONCAT_WS('',t2.hid,t2.new_cid) = CONCAT_WS('',t3.hid,t3.new_cid)
|
|
|
+ |""".stripMargin)
|
|
|
+
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ def main(args: Array[String]): Unit = {
|
|
|
+ val project = "winhc_eci_dev"
|
|
|
+ val config = EsConfig.getEsConfigMap ++ mutable.Map(
|
|
|
+ "spark.hadoop.odps.project.name" -> project,
|
|
|
+ "spark.debug.maxToStringFields" -> "200",
|
|
|
+ "spark.hadoop.odps.spark.local.partition.amt" -> "100"
|
|
|
+ )
|
|
|
+ val spark = SparkUtils.InitEnv("company_staff", config)
|
|
|
+
|
|
|
+ val e = CompanyStaffUtil(spark, project)
|
|
|
+ e.init()
|
|
|
+ if (args.length == 1) {
|
|
|
+ val Array(ds) = args
|
|
|
+ e.inc(ds)
|
|
|
+ } else {
|
|
|
+ e.inc()
|
|
|
+ }
|
|
|
+ spark.stop()
|
|
|
+ }
|
|
|
+}
|