Browse files

fix: move ID-card number validation out into utils

许家凯 4 years ago
parent
commit
b4669c5e95

+ 2 - 23
src/main/scala/com/winhc/bigdata/spark/udf/BaseFunc.scala

@@ -1,7 +1,6 @@
 package com.winhc.bigdata.spark.udf
 
 import com.winhc.bigdata.spark.implicits.CompanyIndexSave2EsHelper
-import com.winhc.bigdata.spark.implicits.RegexUtils._
 import com.winhc.bigdata.spark.utils.BaseUtil
 import com.winhc.bigdata.spark.utils.BaseUtil._
 import org.apache.commons.lang3.StringUtils
@@ -21,7 +20,6 @@ trait BaseFunc {
   @(transient@getter) protected val spark: SparkSession
   private val pattern = "[^\\u4e00-\\u9fa5a-zA-Z \\(\\)().]+".r
 
-  private val id_card_pattern = "^[1-9]\\d{5}(19|20)\\d{2}((0[1-9])|(1[0-2])|([0-1]\\*{1,2})|\\*{2})(([0-2][1-9])|10|20|30|31|\\*{2})\\d{3}[0-9Xx]$".r
 
 
   /* def to_epoch_millis_timestamp(): Unit = {
@@ -38,27 +36,8 @@ trait BaseFunc {
     spark.udf.register("id_card_trim", id_card_trim _)
   }
 
-  // Normalize ID-card number format: trim it if it looks like an ID card, otherwise return it unchanged
-  def id_card_trimOrRaw_udf(): Unit = {
-    spark.udf.register("id_card_trimOrRaw", id_card_trimOrRaw _)
-  }
-
-  def id_card_trimOrRaw(str: String): String = {
-    if (StringUtils.isNotBlank(str) && (id_card_pattern matches str)) {
-      return s"${str.substring(0, 10)}****${str.substring(str.length - 4)}".toUpperCase
-    }
-    str
-  }
-
-  def is_id_card(): Unit = {
-    val maxYear = BaseUtil.nowDate(pattern = "yyyy").toInt
-    spark.udf.register("is_id_card", (str: String) => {
-      if (id_card_pattern matches str) {
-        val d = str.substring(6, 10).toInt
-        return d <= maxYear
-      } else
-        false
-    })
+  def is_id_card_udf(): Unit = {
+    spark.udf.register("is_id_card", is_id_card _)
   }
 
   def code2Name(): (Broadcast[Map[String, Seq[String]]], Broadcast[Map[String, Seq[String]]]) = {
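
After this change BaseFunc keeps only a thin registration wrapper, while the validation logic lives in BaseUtil and can be exercised without a SparkSession. A minimal sketch of the resulting call pattern (sample inputs and the table name are invented; the method names are the ones introduced in this diff):

  import com.winhc.bigdata.spark.utils.BaseUtil.is_id_card

  // Pure function: callable directly, no Spark required.
  is_id_card("110101199003074258")  // true, since birth year 1990 passes the maxYear check
  is_id_card("not-an-id")           // false

  // UDF form, as registered by is_id_card_udf() in BaseFunc:
  //   spark.udf.register("is_id_card", is_id_card _)
  //   spark.sql("SELECT is_id_card(identity_num) FROM some_table")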

+ 22 - 7
src/main/scala/com/winhc/bigdata/spark/utils/BaseUtil.scala

@@ -226,7 +226,7 @@ object BaseUtil {
   }
 
   def title(ygname: String, bgname: String, reason: String): String = {
-     Seq(replaceChar(ygname),replaceChar(bgname),reason).filter(s=>StringUtils.isNotBlank(s)).mkString(",")
+    Seq(replaceChar(ygname), replaceChar(bgname), reason).filter(s => StringUtils.isNotBlank(s)).mkString(",")
   }
 
   def trimBlack(s: String): String = {
@@ -321,6 +321,21 @@ object BaseUtil {
     null
   }
 
+  private val id_card_pattern = "^[1-9]\\d{5}(19|20)\\d{2}((0[1-9])|(1[0-2])|([0-1]\\*{1,2})|\\*{2})(([0-2][1-9])|10|20|30|31|\\*{2})\\d{3}[0-9Xx]$".r
+
+  private lazy val maxYear = BaseUtil.nowDate(pattern = "yyyy").toInt
+
+  def is_id_card(str: String): Boolean = {
+    if (StringUtils.isEmpty(str))
+      return false
+
+    if (id_card_pattern matches str) {
+      val d = str.substring(6, 10).toInt
+      d <= maxYear
+    } else
+      false
+  }
+
   val pat = "(?<province>[^省]+省|.+自治区)?(?<city>[^自治州]+自治州|[^市]+市|[^盟]+盟|[^地区]+地区|.+区划)?(?<district>[^市]+市|[^县]+县|[^旗]+旗|.+区)?".r
 
   /**
@@ -341,12 +356,12 @@ object BaseUtil {
   }
 
   def main(args: Array[String]): Unit = {
-    println(title("xx",null,"reason"))
-    println(parseAddress("大石桥市人民法院"))
-    println(case_no_trim("(2015)怀执字第03601号号"))
-    val seq = Seq("1", "3", "2", "7").mkString("\001")
-    println(sortString(seq))
-    println(id_card_trim("41111119990****062x"))
+//    println(title("xx", null, "reason"))
+//    println(parseAddress("大石桥市人民法院"))
+//    println(case_no_trim("(2015)怀执字第03601号号"))
+//    val seq = Seq("1", "3", "2", "7").mkString("\001")
+//    println(sortString(seq))
+    println(is_id_card("4111111999****062x"))
   }
 
 }
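
Note that the relocated id_card_pattern deliberately accepts '*' masking in the month and day fields, which is why the redacted value tested in main() still validates; the year is then range-checked against the current year via maxYear. Also, the infix `id_card_pattern matches str` form relied on the RegexUtils._ enrichment removed from BaseFunc above (or on Scala 2.13's Regex#matches), so BaseUtil presumably carries a corresponding import that this hunk does not show. A version-independent sketch of the pattern's behavior (inputs invented for illustration):

  val id_card_pattern = ("^[1-9]\\d{5}(19|20)\\d{2}((0[1-9])|(1[0-2])|([0-1]\\*{1,2})|\\*{2})" +
    "(([0-2][1-9])|10|20|30|31|\\*{2})\\d{3}[0-9Xx]$").r

  // Use java.util.regex directly to avoid depending on Scala version or enrichments:
  id_card_pattern.pattern.matcher("4111111999****062x").matches()  // true: month and day masked by '*'
  id_card_pattern.pattern.matcher("411111189901010012").matches()  // false: century must be 19 or 20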

+ 1 - 1
src/main/scala/com/winhc/bigdata/spark/utils/Company_Completion_Utils.scala

@@ -62,7 +62,7 @@ case class Company_Completion_Utils(s: SparkSession,
       println("not all tables have the same partition of newest !!!")
       sys.exit(-1)
     }
-    id_card_trimOrRaw_udf()
+    id_card_trim_udf()
     lastDsIncOds = minDs
     spark.sparkContext.setJobDescription(s"Complete company cid: ${mapTables.size} tables + company mapping table aggregation ($lastDsIncOds)")
     sql(mapTables.map(m => {

+ 3 - 3
src/main/scala/com/winhc/bigdata/spark/utils/IDCard_Completion_Utils.scala

@@ -62,8 +62,8 @@ case class IDCard_Completion_Utils(s: SparkSession,
       println("not all tables have the same partition of newest !!!")
       sys.exit(-1)
     }
-    is_id_card()
-    id_card_trimOrRaw_udf()
+    is_id_card_udf()
+    id_card_trim_udf()
     lastDsIncOds = minDs
     spark.sparkContext.setJobDescription(s"Complete ID-card numbers: ${mapTables.size} tables aggregated ($lastDsIncOds)")
     sql(mapTables.map(m => {
@@ -96,7 +96,7 @@ case class IDCard_Completion_Utils(s: SparkSession,
     sql(
       s"""
          |INSERT ${if (isWindows) "INTO" else "OVERWRITE"} table $project.ads_person_idcard_cloze partition(ds='$lastDsIncOds')
-         |SELECT name, id_card_trimOrRaw(identity_num), company_name, case_no, court_name, source, flag
+         |SELECT name, id_card_trim(identity_num), company_name, case_no, court_name, source, flag
          |FROM(
          |  SELECT name, identity_num, company_name, case_no, court_name, source, flag
          |         ,ROW_NUMBER() OVER (PARTITION BY name,case_no ORDER BY identity_num DESC) num
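
The surrounding query uses the common ROW_NUMBER dedup idiom: partition by (name, case_no), rank rows by identity_num descending, and keep the top row per group. The hunk ends before the filter, but the idiom typically closes with WHERE num = 1. A generic sketch of the same idiom (table and column names invented; not the full production query):

  spark.sql(
    """
      |SELECT name, case_no, identity_num
      |FROM (
      |  SELECT name, case_no, identity_num,
      |         ROW_NUMBER() OVER (PARTITION BY name, case_no ORDER BY identity_num DESC) AS num
      |  FROM some_source
      |) t
      |WHERE num = 1
      |""".stripMargin)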