
fix: clean HTML characters from company dynamic (企业动态) records

许家凯 committed 4 years ago
Commit 4d642b1d48

+ 16 - 4
src/main/scala/com/winhc/bigdata/spark/jobs/dynamic/CompanyDynamic.scala

@@ -4,6 +4,7 @@ import java.util.Date
 
 import com.winhc.bigdata.spark.config.EsConfig
 import com.winhc.bigdata.spark.jobs.message.IntellectualMessage
+import com.winhc.bigdata.spark.udf.BaseFunc
 import com.winhc.bigdata.spark.utils.BaseUtil.isWindows
 import com.winhc.bigdata.spark.utils.ReflectUtils.getClazz
 import com.winhc.bigdata.spark.utils.{AsyncExtract, LoggingUtils, SparkUtils}
@@ -29,7 +30,7 @@ object CompanyDynamic {
                                 project: String, //project the table lives in
                                 ds: String //primary key of this dimension
 
-                               ) extends LoggingUtils with Logging {
+                               ) extends LoggingUtils with Logging with BaseFunc{
     @(transient@getter) val spark: SparkSession = s
 
 
@@ -166,12 +167,23 @@ object CompanyDynamic {
       spark.createDataFrame(rdd, schema)
         .createOrReplaceTempView(s"company_dynamic_tmp$tableName")
 
-      val cols = getColumns(s"$project.$targetTab").filter(!_.equals("ds")).filter(!_.equals("tn"))
-
+//      val cols = getColumns(s"$project.$targetTab").filter(!_.equals("ds")).filter(!_.equals("tn"))
+      unescapeHtml4()
       sql(
         s"""
            |INSERT ${if (isWindows) "INTO" else "OVERWRITE"} TABLE ${getEnvProjectName(env, project)}.$targetTab PARTITION(ds='$ds',tn='$tableName')
-           |SELECT ${cols.mkString(",")}
+           |SELECT  id
+           |        , cid
+           |        , cname
+           |        , info_type
+           |        , unescapeHtml4(rta_desc) rta_desc
+           |        , unescapeHtml4(change_content) change_content
+           |        , change_time
+           |        , biz_id
+           |        , sub_info_type
+           |        , info_risk_level
+           |        , winhc_suggest
+           |        , create_time
            |FROM
            |    company_dynamic_tmp$tableName
            |WHERE id IS NOT NULL
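
A minimal, runnable sketch (not part of this commit) of what the newly registered unescapeHtml4 UDF does to the two rewritten columns. The local master, demo table name and sample rows below are illustrative assumptions only:

import org.apache.commons.lang3.StringEscapeUtils
import org.apache.spark.sql.SparkSession

object UnescapeHtml4UsageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")                      // assumption: local run, only for the demo
      .appName("unescapeHtml4-usage-sketch")
      .getOrCreate()
    import spark.implicits._

    // Register the UDF the same way BaseFunc.unescapeHtml4() does (see the second file below).
    spark.udf.register("unescapeHtml4", (col: String) => StringEscapeUtils.unescapeHtml4(col))

    // Hypothetical rows standing in for company_dynamic_tmp$tableName.
    Seq(("1", "注册资本变更 100万 &amp; 实缴", "&lt;p&gt;法定代表人变更&lt;/p&gt;"))
      .toDF("id", "rta_desc", "change_content")
      .createOrReplaceTempView("company_dynamic_tmp_demo")

    // "&amp;" decodes to "&", "&lt;"/"&gt;" to "<"/">", so the selected rows no longer carry HTML entities.
    spark.sql(
      """SELECT id
        |     , unescapeHtml4(rta_desc)       AS rta_desc
        |     , unescapeHtml4(change_content) AS change_content
        |FROM company_dynamic_tmp_demo
        |WHERE id IS NOT NULL""".stripMargin)
      .show(truncate = false)

    spark.stop()
  }
}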

+ 13 - 5
src/main/scala/com/winhc/bigdata/spark/udf/BaseFunc.scala

@@ -21,11 +21,11 @@ trait BaseFunc {
   private val pattern = "[^\\u4e00-\\u9fa5a-zA-Z \\(\\)().]+".r
 
 
- /* def to_epoch_millis_timestamp(): Unit = {
-    spark.udf.register("to_epoch_millis_timestamp", (et: String) => {
-      DateUtils.toUnixTimestamp(date = et) * 1000 + 28800000L
-    })
-  }*/
+  /* def to_epoch_millis_timestamp(): Unit = {
+     spark.udf.register("to_epoch_millis_timestamp", (et: String) => {
+       DateUtils.toUnixTimestamp(date = et) * 1000 + 28800000L
+     })
+   }*/
 
   def code2Name(): (Broadcast[Map[String, Seq[String]]], Broadcast[Map[String, Seq[String]]]) = {
     val categoryCode2Name = spark.sparkContext.broadcast(spark.sql(
@@ -70,6 +70,14 @@ trait BaseFunc {
     (categoryCode2Name, areaCode2Name)
   }
 
+  def unescapeHtml4(): Unit = {
+    // clean HTML characters
+    spark.udf.register("unescapeHtml4", (col: String) => {
+      import org.apache.commons.lang3.StringEscapeUtils
+      StringEscapeUtils.unescapeHtml4(col)
+    })
+  }
+
   def cleanup(): Unit = {
    // clean special characters
     spark.udf.register("cleanup", (col: String) => {
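
For reference, a short sketch (assuming only commons-lang3 on the classpath, which the commit already imports) of how StringEscapeUtils.unescapeHtml4 handles typical entities, including the null pass-through that keeps the UDF safe on nullable columns:

import org.apache.commons.lang3.StringEscapeUtils

object UnescapeHtml4BehaviourSketch {
  def main(args: Array[String]): Unit = {
    // Named and numeric HTML 4 entities are decoded in a single pass.
    println(StringEscapeUtils.unescapeHtml4("A &amp; B"))               // A & B
    println(StringEscapeUtils.unescapeHtml4("&lt;p&gt;文本&lt;/p&gt;")) // <p>文本</p>
    println(StringEscapeUtils.unescapeHtml4("&#20844;&#21496;"))        // 公司
    // null input is returned unchanged, so null columns stay null after the UDF.
    println(StringEscapeUtils.unescapeHtml4(null))                      // prints "null"
  }
}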