Explorar el Código

fix: 调整索引

许家凯 hace 2 años
padre
commit
6efd15b05f

+ 6 - 1
pom.xml

@@ -359,7 +359,12 @@
             <artifactId>guava</artifactId>
             <version>23.0</version>
         </dependency>
-
+        <!-- https://mvnrepository.com/artifact/com.github.stuxuhai/jpinyin -->
+        <dependency>
+            <groupId>com.github.stuxuhai</groupId>
+            <artifactId>jpinyin</artifactId>
+            <version>1.1.8</version>
+        </dependency>
 
 
         <!--

+ 12 - 8
src/main/scala/com/winhc/bigdata/spark/ng/jobs/CompanyIndexJob.scala

@@ -48,16 +48,22 @@ case class CompanyIndexJob(s: SparkSession,
 
     spark.udf.register("icp_domain_trim", icp_web_site _)
 
-
     company_org_type_udf()
     cleanup()
     area_code()
-    import com.winhc.bigdata.spark.implicits.CaseClass2JsonHelper._
-    def history_name(cname: String, history_names: String): String = CompanyIndexUtils.getHistoryName(cname, history_names).toJson()
+
+    // Historical names as a JSON array string; null when there are none.
+    def history_name(cname: String, history_names: String): String = {
+      val entries = Option(CompanyIndexUtils.getHistoryName(cname, history_names))
+      entries
+        .filter(_.nonEmpty)
+        .map(_.map(_.buildJson).mkString("[", ",", "]"))
+        .orNull
+    }
 
     spark.udf.register("get_history_name", history_name _)
 
-    def company(name: String): String = CompanyIndexUtils.getCompanyName(name).toJson()
+    def company(name: String): String = CompanyIndexUtils.getCompanyName(name).buildJson
 
     spark.udf.register("get_company", company _)
 
@@ -73,9 +79,7 @@ case class CompanyIndexJob(s: SparkSession,
     spark.udf.register("get_phones_emails", get_phones_emails _)
     spark.udf.register("to_millis_timestamp", tmp _)
 
-    def hu(id: String, name: String): String = CompanyIndexUtils.getHuman(id, name).toJson()
-
-    spark.udf.register("get_human", hu _)
+    spark.udf.register("get_human", CompanyIndexUtils.getHuman _)
 
     spark.udf.register("get_holder", CompanyIndexUtils.get_holder _)
 
@@ -106,7 +110,7 @@ case class CompanyIndexJob(s: SparkSession,
     spark.udf.register("get_amount", RegCapitalAmount.getAmount _)
   }
 
-  private val target_tab = "winhc_ng.out_company_v8_index"
+  private val target_tab = "winhc_ng.out_company_v8_index_test"
 
 
   private val org_prefix = "ads"

+ 61 - 9
src/main/scala/com/winhc/bigdata/spark/utils/CompanyIndexUtils.scala

@@ -1,5 +1,6 @@
 package com.winhc.bigdata.spark.utils
 
+import com.github.stuxuhai.jpinyin.ChineseHelper
 import com.winhc.bigdata.spark.implicits.CaseClass2JsonHelper._
 import com.winhc.bigdata.spark.utils.BaseUtil.cleanup
 import org.apache.commons.lang3.StringUtils
@@ -9,17 +10,60 @@ import org.apache.commons.lang3.StringUtils
  * @date: 2020/11/23 10:46
  */
 
-case class human(id: String, name: String)
+case class human(id: String, name: String) {
+  /**
+   * Serializes this person as JSON with `name` converted to Simplified Chinese.
+   * The converter returns null for a null/empty name, so the original name is
+   * kept in that case rather than calling `equals` on the null result.
+   */
+  def buildJson: String = {
+    val simplifiedChinese = CompanyIndexUtils.convertToSimplifiedChinese(name)
+    val displayName = if (simplifiedChinese == null) name else simplifiedChinese
+    Map("id" -> id,
+      "name" -> displayName
+    ).toJson()
+  }
+}
 
-case class holder(id: String, `type`: String, name: String)
+case class holder(id: String, `type`: String, name: String) {
+  /**
+   * Serializes this holder as JSON with `name` converted to Simplified Chinese.
+   *
+   * A null/empty name makes the converter return null; the original name is
+   * kept in that case instead of calling a method on null or emitting null for "".
+   */
+  def buildJson: String = {
+    val simplifiedChinese = CompanyIndexUtils.convertToSimplifiedChinese(name)
+    val displayName = if (simplifiedChinese == null) name else simplifiedChinese
+    Map("id" -> id,
+      "type" -> `type`,
+      "name" -> displayName
+    ).toJson()
+  }
+}
 
-case class CompanyName(show: String, value: String)
+case class CompanyName(show: String, value: String) {
+  /** JSON form; adds Simplified Chinese fields only when `show` is not already simplified. */
+  def buildJson: String = {
+    val simplifiedChinese = CompanyIndexUtils.convertToSimplifiedChinese(show)
+    // A null result means `show` was null/empty: nothing to convert, and no NPE.
+    if (simplifiedChinese == null || simplifiedChinese == show)
+      Map("show" -> show,
+        "value" -> value
+      ).toJson()
+    else
+      Map("show" -> show,
+        "value" -> CompanyIndexUtils.convertToSimplifiedChinese(value),
+        "simplified_chinese" -> simplifiedChinese
+      ).toJson()
+  }
+}
 
 
 object CompanyIndexUtils {
-  def getHuman(id: String, name: String): human = human(id, name)
+  def getHuman(id: String, name: String): String = human(id, name).buildJson
 
-  def get_holder(id: String, `type`: String, name: String): String = holder(id, `type`, name).toJson()
+  def get_holder(id: String, `type`: String, name: String): String = holder(id, `type`, name).buildJson
 
 
   def getCompanyName(name: String): CompanyName = {
@@ -35,8 +79,9 @@ object CompanyIndexUtils {
       null
     } else {
       val res = getSplit(names)
+        .filter(StringUtils.isNotBlank(_))
+        .map(StringUtils.trim(_))
         .filter(!cname.equals(_))
-        .filter(StringUtils.isNoneEmpty(_))
         .map(getCompanyName)
       if (res.isEmpty) {
         null
@@ -71,9 +116,16 @@ object CompanyIndexUtils {
     }
   }
 
+  /** Simplified-Chinese form of `name`; yields null for a null or empty input. */
+  def convertToSimplifiedChinese(name: String): String =
+    Option(name).filter(_.nonEmpty)
+      .map(text => ChineseHelper.convertToSimplifiedChinese(text)).orNull
+
+
   def main(args: Array[String]): Unit = {
-    println(company_score_weight("存续(在营、开业、在册)","新疆现代特油科技股份有限公司","200309577000000","1"))
+    //    println(company_score_weight("存续(在营、开业、在册)","新疆现代特油科技股份有限公司","200309577000000","1"))
+
+    println(getCompanyName("香港中旅(中國)國際投資有限公司").buildJson)
   }
-  //200309577000000
-  //20030957700.00
+
 }