ソースを参照

fix: 过滤爬虫上传的脏数据

许家凯 3 年 前
コミット
908fad384d

+ 4 - 7
src/main/scala/com/winhc/bigdata/spark/ng/jobs/general_handler.scala

@@ -51,13 +51,10 @@ case class general_handler(s: SparkSession,
 
   val clean_up =
     s"""
-       |${
-      inter_cols.contains("company_id") match {
-        case true => "company_id <> '0' AND"
-        case false => ""
-      }
-    }
-       |trim(concat_ws('',${md5_fields.mkString(",")})) <> ''
+       |company_id <> '0'
+       |AND company_id is not null
+       |AND ${md5_fields.map(" " + _ + " is not null ").mkString("AND")}
+       |AND trim(concat_ws('',${md5_fields.mkString(",")})) <> ''
        |""".stripMargin
 
   val up = inter_cols.contains("update_time") match {

+ 1 - 0
src/main/scala/com/winhc/bigdata/spark/ng/jobs/inc_company_ng.scala

@@ -122,6 +122,7 @@ case class inc_company_ng(s: SparkSession,
          |                        SELECT  *
          |                        FROM    $inc_ods_tab
          |                        WHERE   ds > $org_ds
+         |                        AND     company_id is not null
          |                        AND     name is not null
          |                        AND     trim(name) <> ''
          |                    ) AS t1