Browse Source

fix: 查老赖修复拉历史数据问题

许家凯 4 years ago
parent
commit
bf77cc2a6a

+ 45 - 44
src/main/scala/com/winhc/bigdata/spark/jobs/deadbeat/deadbeat_info.scala

@@ -215,23 +215,31 @@ case class deadbeat_info(s: SparkSession,
     sql(
       s"""
          |SELECT  *
-         |FROM    winhc_eci_dev.ads_deadbeat_person
-         |WHERE   ds > '$target_last_ds'
-         |AND     card_num IS NOT NULL
-         |UNION ALL
-         |SELECT  t2.*
          |FROM    (
-         |            SELECT  DISTINCT CONCAT(name,card_num) AS KEY
-         |            FROM    winhc_eci_dev.ads_deadbeat_person
-         |            WHERE   ds > '$target_last_ds'
-         |            AND     card_num IS NOT NULL
-         |        ) AS t1
-         |JOIN    (
          |            SELECT  *
-         |            FROM    winhc_eci_dev.ads_deadbeat_person
-         |            WHERE   ds <= '$target_last_ds'
-         |        ) AS t2
-         |ON      t1.KEY = CONCAT(t2.name,t2.card_num)
+         |                    ,ROW_NUMBER() OVER(PARTITION BY rowkey ORDER BY ds DESC ) AS num
+         |            FROM    (
+         |                        SELECT  *
+         |                        FROM    winhc_eci_dev.ads_deadbeat_person
+         |                        WHERE   ds > '$target_last_ds'
+         |                        AND     card_num IS NOT NULL
+         |                        UNION ALL
+         |                        SELECT  t2.*
+         |                        FROM    (
+         |                                    SELECT  DISTINCT CONCAT(name,card_num) AS my_key
+         |                                    FROM    winhc_eci_dev.ads_deadbeat_person
+         |                                    WHERE   ds > '$target_last_ds'
+         |                                    AND     card_num IS NOT NULL
+         |                                ) AS t1
+         |                        JOIN    (
+         |                                    SELECT  *
+         |                                    FROM    winhc_eci_dev.ads_deadbeat_person
+         |                                    WHERE   ds <= '$target_last_ds'
+         |                                ) AS t2
+         |                        ON      t1.my_key = CONCAT(t2.name,t2.card_num)
+         |                    ) AS t3
+         |        ) AS t4
+         |WHERE   t4.num = 1
          |""".stripMargin)
       .createOrReplaceTempView("tmp_ads_deadbeat_person_all_tmp")
 
@@ -263,23 +271,8 @@ case class deadbeat_info(s: SparkSession,
          |            FROM    tmp_ads_deadbeat_person_all_tmp
          |            GROUP BY name
          |                     ,card_num
-         |---            UNION ALL
-         |---            SELECT  md5(cleanup(CONCAT_WS('',rowkey,name))) AS id
-         |---                    ,name
-         |---                    ,card_num
-         |---                    ,NULL AS birth_year
-         |---                    ,NULL AS gender
-         |---                    ,NULL AS province
-         |---                    ,NULL AS city
-         |---                    ,NULL AS district
-         |---                    ,get_empty_map(rowkey,tn,deleted,publish_date) AS labels
-         |---            FROM    winhc_eci_dev.ads_deadbeat_person
-         |---            WHERE   ds > $target_last_ds
-         |---            AND     card_num IS NULL
          |        )
          |""".stripMargin)
-    //      .show(10000)
-
   }
 
 
@@ -315,22 +308,30 @@ case class deadbeat_info(s: SparkSession,
     sql(
       s"""
          |SELECT  *
-         |FROM    winhc_eci_dev.ads_deadbeat_company
-         |WHERE   ds > '$target_last_ds'
-         |UNION ALL
-         |SELECT  t2.*
          |FROM    (
-         |            SELECT  DISTINCT cid
-         |            FROM    winhc_eci_dev.ads_deadbeat_company
-         |            WHERE   ds > '$target_last_ds'
-         |            AND     cid is not null
-         |        ) AS t1
-         |JOIN    (
          |            SELECT  *
-         |            FROM    winhc_eci_dev.ads_deadbeat_company
-         |            WHERE   ds <= '$target_last_ds'
-         |        ) AS t2
-         |ON      t1.cid = t2.cid
+         |                    ,ROW_NUMBER() OVER(PARTITION BY rowkey ORDER BY ds DESC ) AS num
+         |            FROM    (
+         |                        SELECT  *
+         |                        FROM    winhc_eci_dev.ads_deadbeat_company
+         |                        WHERE   ds > '$target_last_ds'
+         |                        UNION ALL
+         |                        SELECT  t2.*
+         |                        FROM    (
+         |                                    SELECT  DISTINCT cid
+         |                                    FROM    winhc_eci_dev.ads_deadbeat_company
+         |                                    WHERE   ds > '$target_last_ds'
+         |                                    AND     cid IS NOT NULL
+         |                                ) AS t1
+         |                        JOIN    (
+         |                                    SELECT  *
+         |                                    FROM    winhc_eci_dev.ads_deadbeat_company
+         |                                    WHERE   ds <= '$target_last_ds'
+         |                                ) AS t2
+         |                        ON      t1.cid = t2.cid
+         |                    ) AS t3
+         |        ) AS t4
+         |WHERE   t4.num = 1
          |""".stripMargin).createOrReplaceTempView("all_deadbeat_tmp_company_tmp")
 
     sql(

+ 7 - 58
src/main/scala/com/winhc/bigdata/spark/jobs/deadbeat/dishonest_info.scala

@@ -24,6 +24,10 @@ case class dishonest_info(s: SparkSession,
       case true => "company_dishonest_info_human"
       case false => "company_dishonest_info"
     }
+    val cid = is_person match {
+      case true => ""
+      case false => ",cid"
+    }
 
     val target_table = is_person match {
       case true => s"$project.ads_deadbeat_person"
@@ -40,12 +44,7 @@ case class dishonest_info(s: SparkSession,
     val view_fields =
       s"""
          |rowkey
-         |${
-        is_person match {
-          case true => ""
-          case false => ",cid"
-        }
-      }
+         |${cid}
          |,name
          |,card_num
          |,pub_date as publish_date
@@ -83,68 +82,18 @@ case class dishonest_info(s: SparkSession,
            |FROM    $inc_ads_table
            |WHERE   ds > '$last_ds'
            |""".stripMargin)
-        .cache()
         .createOrReplaceTempView(tmp_tab)
 
       sql(
         s"""
            |INSERT OVERWRITE TABLE $target_table PARTITION(ds='$inc_ads_last_ds',tn='$tn')
            |SELECT  rowkey
-           |         ${
-          is_person match {
-            case true => ""
-            case false => ",cid"
-          }
-        }
+           |        ${cid}
            |        ,name
            |        ,card_num
            |        ,publish_date
            |        ,deleted
-           |FROM    (
-           |            SELECT  *
-           |                    ,ROW_NUMBER() OVER(PARTITION BY rowkey ORDER BY ds DESC ) AS num
-           |            FROM    (
-           |                        SELECT  rowkey
-           |                        ${
-          is_person match {
-            case true => ""
-            case false => ",cid"
-          }
-        }
-           |                                ,name
-           |                                ,card_num
-           |                                ,publish_date
-           |                                ,deleted
-           |                                ,ds
-           |                        FROM    $tmp_tab
-           |                        UNION ALL
-           |                        SELECT  t2.rowkey
-           |                        ${
-          is_person match {
-            case true => ""
-            case false => ",t2.cid"
-          }
-        }
-           |                                ,t2.name
-           |                                ,t2.card_num
-           |                                ,t2.publish_date
-           |                                ,t2.deleted
-           |                                ,t2.ds
-           |                        FROM    (
-           |                                    SELECT  DISTINCT concat_ws('_',name,card_num) AS d_id
-           |                                    FROM    $tmp_tab
-           |                                ) AS t1
-           |                        JOIN    (
-           |                                    SELECT  concat_ws('_',name,card_num) AS d_id
-           |                                            ,*
-           |                                    FROM    $target_table
-           |                                    WHERE   ds > '$last_ds'
-           |                                    AND     tn = '$tn'
-           |                                ) AS t2
-           |                        ON      t1.d_id = t2.d_id
-           |                    ) AS t3
-           |        ) AS t4
-           |WHERE   t4.num = 1
+           |FROM    $tmp_tab
            |""".stripMargin)
 
     }