|
@@ -2,6 +2,7 @@ package com.winhc.bigdata.spark.utils
|
|
|
|
|
|
import java.util.Date
|
|
|
|
|
|
+import com.winhc.bigdata.spark.udf.CompanyMapping
|
|
|
import org.apache.spark.sql.SparkSession
|
|
|
|
|
|
import scala.annotation.meta.getter
|
|
@@ -11,40 +12,47 @@ import scala.annotation.meta.getter
|
|
|
* Flatten (explode) cids and write the results to tables
|
|
|
*/
|
|
|
|
|
|
-case class CompanyForCidsUtils(s: SparkSession, sourceTable: String, cols: Seq[String]) extends LoggingUtils {
|
|
|
+case class CompanyForCidsUtils(s: SparkSession, space: String, sourceTable: String, cols: Seq[String]) extends LoggingUtils with CompanyMapping{
|
|
|
// Session used by every query in this class; the annotation targets the generated getter with `transient`.
@(transient@getter) val spark: SparkSession = s
|
|
|
|
|
|
import spark.implicits._
|
|
|
import spark._
|
|
|
import org.apache.spark.sql.functions._
|
|
|
|
|
|
// Per-table override of (column holding the ';'-style id list, delimiter used inside it).
// Tables that are not listed fall back to ("cids", ";") inside calc().
+ val tabMapping =
|
|
|
+ Map("company_court_open_announcement" -> ("litigant_cids", ";") //court hearing announcement
|
|
|
+ )
|
|
|
+
|
|
|
// Writes two partitions for the ds value returned by BaseUtil.getPartion(odsTable, spark)
// (presumably the newest ODS partition - confirm against BaseUtil):
//   1. <space>.ads_<sourceTable>_list : one row per exploded id, de-duplicated on `cols`
//   2. <space>.ads_<sourceTable>      : the ODS rows plus the aggregated new_cids per id
// Returns Unit; the only outputs are the two INSERT OVERWRITE statements and the start/end println lines.
def calc(): Unit = {
|
|
|
println(s"${this.getClass.getSimpleName} calc start! " + new Date().toString)
|
|
|
- val odsTable = s"ods_$sourceTable"
|
|
|
- val adsListTable = s"ads_${sourceTable}_list"
|
|
|
- val adsTable = s"ads_$sourceTable"
|
|
|
- val companyMapping = "company_name_mapping_pro"
|
|
|
// All table names are qualified with the `space` constructor argument.
+ val odsTable = s"${space}.ods_$sourceTable"
|
|
|
+ val adsListTable = s"${space}.ads_${sourceTable}_list"
|
|
|
+ val adsTable = s"${space}.ads_$sourceTable"
|
|
|
+ val companyMapping = s"${space}.company_name_mapping_pro_v2"
|
|
|
// NOTE(review): presumably registers the SQL functions used below (e.g. `cleanup`) via CompanyMapping - confirm.
+ prepareFunctions(spark)
|
|
|
+
|
|
|
val ds = BaseUtil.getPartion(odsTable, spark)
|
|
|
+
|
|
|
// Column to explode and its delimiter; defaults to ("cids", ";") when sourceTable has no entry in tabMapping.
+ val (split_cols,delimiter) = tabMapping.getOrElse(sourceTable,("cids",";"))
|
|
|
+
|
|
|
//table columns (the partition column ds is excluded)
val columns: Seq[String] = spark.table(odsTable).schema.map(_.name).filter(!_.equals("ds"))
|
|
|
// Column names of the target list table (ds excluded); the first INSERT selects exactly these names,
// so the inner query has to produce every one of them.
+ val list_columns: Seq[String] = spark.table(adsListTable).schema.map(_.name).filter(!_.equals("ds"))
|
|
|
val disCol = cols
|
|
|
-
|
|
|
- sql(s"select * from $odsTable where ds = $ds and cids is not null and trim(cids) <> '' ")
|
|
|
- .dropDuplicates(disCol)
|
|
|
- .createOrReplaceTempView("t1")
|
|
|
-
|
|
|
- sql(s"CACHE TABLE t1")
|
|
|
// De-duplication columns without new_cid; their concatenation is what gets cleaned and md5-hashed into rowkey.
+ val cols_md5 = disCol.filter(!_.equals("new_cid"))
|
|
|
|
|
|
//flatten (explode) into the new list table
// Explodes $split_cols into one row per cid, left-joins the company mapping table on cid and keeps, per
// disCol group, the single row whose new_cid is numerically closest to cid (ROW_NUMBER ... num = 1).
// Rows whose cleaned concatenation of cols_md5 is null or blank are dropped.
// NOTE(review): new_cid is referenced but never selected in the lines visible here; it presumably comes
// from lines elided at the diff hunk boundary inside this query - confirm against the full file.
sql(
|
|
|
s"""
|
|
|
+ |insert overwrite table ${adsListTable} partition (ds=${ds})
|
|
|
|SELECT
|
|
|
- | rowkey,new_cid,${columns.mkString(",")}
|
|
|
+ | ${list_columns.mkString(",")}
|
|
|
|FROM (
|
|
|
| SELECT
|
|
|
| *
|
|
|
- | ,ROW_NUMBER() OVER (PARTITION BY id,new_cid ORDER BY - ABS(CAST(new_cid AS BIGINT )- CAST(cid AS BIGINT )) DESC ) num
|
|
|
- | ,CONCAT_WS('_',new_cid,id) AS rowkey
|
|
|
+ | ,ROW_NUMBER() OVER (PARTITION BY ${disCol.mkString(",")} ORDER BY - ABS(CAST(new_cid AS BIGINT )- CAST(cid AS BIGINT )) DESC ) num
|
|
|
+ | ,CONCAT_WS('_',new_cid,md5(cleanup(CONCAT_WS('',${cols_md5.mkString(",")})))) AS rowkey
|
|
|
+ | ,cleanup(CONCAT_WS('',${cols_md5.mkString(",")})) AS cols
|
|
|
| FROM (
|
|
|
| SELECT
|
|
|
| c.*
|
|
@@ -52,40 +60,42 @@ case class CompanyForCidsUtils(s: SparkSession, sourceTable: String, cols: Seq[S
|
|
|
| FROM (
|
|
|
| SELECT
|
|
|
| *
|
|
|
- | FROM t1 a
|
|
|
- | LATERAL VIEW explode(split(cids,';')) b AS cid
|
|
|
+ | FROM $odsTable a
|
|
|
+ | LATERAL VIEW explode(split($split_cols,'$delimiter')) b AS cid
|
|
|
+ | WHERE a.ds = $ds
|
|
|
+ | AND $split_cols is not null
|
|
|
+ | AND trim($split_cols) <> ''
|
|
|
| ) c
|
|
|
| LEFT JOIN $companyMapping d
|
|
|
| ON c.cid = d.cid
|
|
|
| ) e
|
|
|
| ) f
|
|
|
- |WHERE num =1
|
|
|
+ |WHERE num =1 AND cols is not null AND trim(cols) <> ''
|
|
|
|""".stripMargin)
|
|
|
// NOTE(review): this view now wraps the result of an INSERT statement, and the new second query reads
// $adsListTable rather than t2 - confirm whether the view is still needed.
.createOrReplaceTempView(s"t2")
|
|
|
|
|
|
//aggregate the new cids
// Re-reads the ODS rows and attaches new_cids: the distinct new_cid values per id, joined with ';',
// taken from the list partition written by the previous statement.
// NOTE(review): the outer filter of this query compares y.ds with the literal 20200604, while the insert
// targets partition $ds and every other filter uses $ds. For any other ds the rows written would come
// from partition 20200604 - looks like a leftover from testing; confirm and switch to $ds.
- val df1 = sql(
|
|
|
+ sql(
|
|
|
s"""
|
|
|
+ |insert overwrite table ${adsTable} partition (ds=${ds})
|
|
|
|SELECT
|
|
|
- |x.new_cids,${columns.mkString(",")}
|
|
|
- |FROM t1
|
|
|
+ |md5(cleanup(CONCAT_WS('',${cols_md5.mkString(",")}))) AS rowkey
|
|
|
+ |,x.new_cids
|
|
|
+ |,${columns.mkString(",")}
|
|
|
+ |FROM $odsTable y
|
|
|
|LEFT JOIN (
|
|
|
| SELECT id as new_id
|
|
|
| ,concat_ws(';',collect_set(new_cid)) new_cids
|
|
|
- | FROM t2
|
|
|
+ | FROM $adsListTable t
|
|
|
+ | WHERE ds = $ds
|
|
|
| GROUP BY id
|
|
|
| ) x
|
|
|
- |ON t1.id = x.new_id
|
|
|
- |""".stripMargin)
|
|
|
-
|
|
|
- df1.createOrReplaceTempView("t3")
|
|
|
-
|
|
|
- sql(s"select rowkey,new_cid,${columns.mkString(",")} from t2").show(10)
|
|
|
- sql(s"select new_cids,${columns.mkString(",")} from t3").show(10)
|
|
|
+ |ON y.id = x.new_id
|
|
|
+ |WHERE y.ds = 20200604
|
|
|
+ |AND $split_cols IS NOT NULL
|
|
|
+ |AND trim($split_cols) <> ''
|
|
|
+ |""".stripMargin).createOrReplaceTempView("t3")
|
|
|
|
|
|
- //写表
|
|
|
- sql(s"insert overwrite table ${adsListTable} partition (ds=${ds}) select rowkey,new_cid,${columns.mkString(",")} from t2")
|
|
|
- sql(s"insert overwrite table ${adsTable} partition (ds=${ds}) select new_cids,${columns.mkString(",")} from t3")
|
|
|
println(s"${this.getClass.getSimpleName} calc end! " + new Date().toString)
|
|
|
}
|
|
|
}
|