Przeglądaj źródła

对天眼查4大个人数据采用身份证正则校验判断是否为自然人

晏永年 4 lat temu
rodzic
commit
f6d4383a53

+ 3 - 3
src/main/scala/com/winhc/bigdata/spark/utils/IDCard_Completion_Utils.scala

@@ -25,6 +25,7 @@ case class IDCard_Completion_Utils(s: SparkSession,
     println(s"${this.getClass.getSimpleName} calc start! " + new Date().toString)
 
     prepareFunctions(spark)
+    is_id_card_udf()
 
     //参与补全的表
     var mapTables = new mutable.HashMap[String, (String, String, String, String, String, String, String)]()
@@ -62,7 +63,6 @@ case class IDCard_Completion_Utils(s: SparkSession,
       println("not all tables have the same partition of newest !!!")
       sys.exit(-1)
     }
-    is_id_card_udf()
     id_card_trim_udf()
     lastDsIncOds = minDs
     spark.sparkContext.setJobDescription(s"补全身份证号码:${mapTables.size}个表聚合($lastDsIncOds)")
@@ -70,11 +70,11 @@ case class IDCard_Completion_Utils(s: SparkSession,
       s"""
          |SELECT ${m._2._2} AS name, ${m._2._3} AS identity_num, ${m._2._4} AS company_name, ${m._2._5} AS case_no, ${m._2._6} AS court_name, ${m._2._7} AS source, ${m._2._7} AS flag
          |FROM $project.ods_${m._1}
-         |WHERE ds>'0' AND ${m._2._1} IS NULL
+         |WHERE ds>'0' AND is_id_card${m._2._3}//严格限制必须有符合要求的身份证号码
          |UNION ALL
          |SELECT ${m._2._2} AS name, ${m._2._3} AS identity_num, ${m._2._4} AS company_name, ${m._2._5} AS case_no, ${m._2._6} AS court_name, ${m._2._7} AS source, ${m._2._7} AS flag
          |FROM $project.inc_ods_${m._1}
-         |WHERE ds>'0' AND ${m._2._1} IS NULL
+         |WHERE ds>'0' AND is_id_card${m._2._3}//严格限制必须有符合要求的身份证号码
          |""".stripMargin
     }).toArray.mkString(" UNION ALL ")
     ).where("name IS NOT NULL AND case_no IS NOT NULL AND LENGTH(name)>0 AND LENGTH(case_no)>0")