瀏覽代碼

fix: 过滤爬虫上传的脏数据

许家凯 3 年之前
父節點
當前提交
4e2708eb9d

+ 1 - 0
src/main/scala/com/winhc/bigdata/spark/ng/jobs/general_handler.scala

@@ -53,6 +53,7 @@ case class general_handler(s: SparkSession,
     s"""
        |company_id <> '0'
        |AND company_id is not null
+       |AND trim(company_id) <> ''
        |AND ${md5_fields.map(" " + _ + " is not null ").mkString("AND")}
        |AND trim(concat_ws('',${md5_fields.mkString(",")})) <> ''
        |""".stripMargin

+ 9 - 0
src/main/scala/com/winhc/bigdata/spark/ng/jobs/inc_company_ng.scala

@@ -57,10 +57,18 @@ case class inc_company_ng(s: SparkSession,
          |                        SELECT  *
          |                        FROM    $ods_tab
          |                        WHERE   ds > 0
+         |                        AND     company_id is not null
+         |                        AND     trim(company_id) <> ''
+         |                        AND     name is not null
+         |                        AND     trim(name) <> ''
          |                        UNION ALL
          |                        SELECT  *
          |                        FROM    $inc_ods_tab
          |                        WHERE   ds > 0
+         |                        AND     company_id is not null
+         |                        AND     trim(company_id) <> ''
+         |                        AND     name is not null
+         |                        AND     trim(name) <> ''
          |                    ) AS t1
          |        ) AS t2
          |WHERE   t2.num = 1
@@ -123,6 +131,7 @@ case class inc_company_ng(s: SparkSession,
          |                        FROM    $inc_ods_tab
          |                        WHERE   ds > $org_ds
          |                        AND     company_id is not null
+         |                        AND     trim(company_id) <> ''
          |                        AND     name is not null
          |                        AND     trim(name) <> ''
          |                    ) AS t1