Browse Source

feat: 添加案号规整函数

许家凯 4 years ago
parent
commit
7046d87aee

+ 5 - 1
src/main/scala/com/winhc/bigdata/spark/udf/BaseFunc.scala

@@ -1,6 +1,7 @@
 package com.winhc.bigdata.spark.udf
 
 import com.winhc.bigdata.spark.implicits.CompanyIndexSave2EsHelper
+import com.winhc.bigdata.spark.implicits.RegexUtils._
 import com.winhc.bigdata.spark.utils.BaseUtil
 import com.winhc.bigdata.spark.utils.BaseUtil._
 import org.apache.commons.lang3.StringUtils
@@ -29,8 +30,11 @@ trait BaseFunc {
      })
    }*/
 
+  /**
+   * Registers the Spark SQL function `case_no_trim`.
+   *
+   * The registered function takes one string argument and delegates to the
+   * `case_no_trim` method that is in scope in this file.
+   */
+  def case_no_trim_udf(): Unit = {
+    val normalize: String => String = caseNo => case_no_trim(caseNo)
+    spark.udf.register("case_no_trim", normalize)
+  }
+
   /**
    * Registers the Spark SQL function `is_id_card`: a one-argument function
    * over a string that returns the result of `id_card_pattern matches str`.
    *
    * NOTE(review): `id_card_pattern` and the `matches` operator presumably come
    * from `RegexUtils._`, which is imported at file level; that is why the
    * method-local import is dropped here. Confirm against RegexUtils.
    */
   def is_id_card(): Unit = {
-    import com.winhc.bigdata.spark.implicits.RegexUtils._
     spark.udf.register("is_id_card", (str: String) => id_card_pattern matches str)
   }
 

+ 41 - 3
src/main/scala/com/winhc/bigdata/spark/utils/BaseUtil.scala

@@ -5,6 +5,7 @@ import java.util.regex.Pattern
 import java.util.{Calendar, Date, Locale}
 
 import cn.hutool.core.util.StrUtil
+import com.winhc.bigdata.spark.implicits.RegexUtils._
 import org.apache.commons.lang3.StringUtils
 import org.apache.commons.lang3.time.DateFormatUtils
 import org.apache.spark.sql.SparkSession
@@ -215,10 +216,47 @@ object BaseUtil {
     r
   }
 
+
+  /**
+   * Converts full-width characters to their half-width equivalents.
+   *
+   * The ideographic space (U+3000) becomes an ASCII space, and every character
+   * in the full-width ASCII range U+FF01..U+FF5E is shifted down by 0xFEE0 to
+   * its ASCII counterpart (U+0021..U+007E). All other characters are kept.
+   *
+   * @param str text to convert; may be null
+   * @return the converted text, or null when `str` is null
+   */
+  def toDBC(str: String): String = {
+    // Guard first: the previous implementation threw a NullPointerException here.
+    if (str == null) null
+    else str.map {
+      case '\u3000' => ' '
+      case ch if ch >= '\uFF01' && ch <= '\uFF5E' => (ch - 0xFEE0).toChar
+      case ch => ch
+    }
+  }
+
+  // Captures the case number: a bracketed 4-digit year, then everything up to and
+  // including the next "号" if there is one. The leading ".*" is greedy, so when
+  // several bracketed years are present the last one is captured.
+  // Only ASCII brackets are listed because the input has been through toDBC.
+  private val case_pat = """.*(\(\d{4}\)[^号]*号?).*""".r
+  // A 4-digit year written as "2015年"; rewritten to the bracketed form "(2015)".
+  private val year_pat = """(\d{4})年""".r
+
+  /**
+   * Normalizes a court case number.
+   *
+   * Steps: convert full-width characters to half-width, remove ASCII spaces,
+   * rewrite every "yyyy年" as "(yyyy)", then extract the part that starts at the
+   * bracketed year and runs through the following "号", if present.
+   *
+   * @param str raw case number text
+   * @return the normalized case number, or null when `str` is blank, shorter
+   *         than 8 characters once spaces are removed, or has no bracketed year
+   */
+  def case_no_trim(str: String): String = {
+    if (StringUtils.isBlank(str)) null
+    else {
+      val compact = toDBC(str).replace(" ", "")
+      if (compact.length < 8) null
+      else {
+        // "(" and ")" are not special in a replacement template; only "$1" is expanded.
+        year_pat.replaceAllIn(compact, "($1)") match {
+          // A Regex extractor only succeeds on a whole-string match, so this one
+          // pass both tests the input and yields the captured case number.
+          case case_pat(caseNo) => caseNo
+          case _ => null
+        }
+      }
+    }
+  }
+
   /**
    * Ad-hoc manual check: prints the result of `case_no_trim` for one sample
    * case number whose year is written as "2015年" and whose trailing "号" is
    * duplicated.
    */
   def main(args: Array[String]): Unit = {
-    println(label("1"))
-    println(label("0"))
-    println(label("2"))
+    println(case_no_trim("2015年怀执字第03601号号"))
   }
 
 }