
Dynamic mentions, pom update

许家凯, 4 years ago
commit 1177b509e8

+ 25 - 0
pom.xml

@@ -77,6 +77,26 @@
                     <groupId>org.scala-lang</groupId>
                     <artifactId>scalap</artifactId>
                 </exclusion>
+                <exclusion>
+                    <artifactId>hadoop-yarn-api</artifactId>
+                    <groupId>org.apache.hadoop</groupId>
+                </exclusion>
+                <exclusion>
+                    <artifactId>hadoop-yarn-server-common</artifactId>
+                    <groupId>org.apache.hadoop</groupId>
+                </exclusion>
+                <exclusion>
+                    <artifactId>hadoop-yarn-client</artifactId>
+                    <groupId>org.apache.hadoop</groupId>
+                </exclusion>
+                <exclusion>
+                    <artifactId>hadoop-yarn-common</artifactId>
+                    <groupId>org.apache.hadoop</groupId>
+                </exclusion>
+                <exclusion>
+                    <artifactId>hadoop-mapreduce-client-common</artifactId>
+                    <groupId>org.apache.hadoop</groupId>
+                </exclusion>
             </exclusions>
         </dependency>
         <dependency>
@@ -112,6 +132,7 @@
                     <groupId>com.aliyun.odps</groupId>
                 </exclusion>
             </exclusions>
+            <scope>provided</scope>
         </dependency>
         <dependency>
             <groupId>com.aliyun.odps</groupId>
@@ -163,6 +184,10 @@
                     <groupId>org.apache.spark</groupId>
                     <artifactId>spark-catalyst_2.11</artifactId>
                 </exclusion>
+                <exclusion>
+                    <artifactId>hadoop-yarn-common</artifactId>
+                    <groupId>org.apache.hadoop</groupId>
+                </exclusion>
             </exclusions>
         </dependency>
         <!-- https://mvnrepository.com/artifact/com.aliyun.hbase/alihbase-client -->

+ 5 - 12
src/main/scala/com/winhc/bigdata/spark/jobs/CompanyIncCompany2Es.scala

@@ -93,13 +93,6 @@ object CompanyIncCompany2Es {
         .seq
 
      //read the data
-      /* val df = sql(
-         s"""
-            |SELECT  ${companyCols.mkString(",")}
-            |FROM    $project.inc_ods_company
-            |WHERE   ds > $start_partition and ds <= $end_partition
-            |""".stripMargin)   */
-
      // remove duplicate rows within the data itself
       val df = sql(
         s"""
@@ -110,13 +103,13 @@ object CompanyIncCompany2Es {
            |            FROM    (
            |                        SELECT  *
            |                        FROM    $project.inc_ods_company
-           |                        WHERE   ds > $start_partition and ds <= $end_partition
+           |                        WHERE   ds > $start_partition and ds <= $end_partition and cid is not null
            |                    ) as a
            |        ) AS tmp
            |WHERE   tmp.c = 1
            |""".stripMargin)
 
-      df.createOrReplaceTempView("tmp_company_inc")
+      df.cache().createOrReplaceTempView("tmp_company_inc")
 
      //write out to ads
       sql(
@@ -143,8 +136,8 @@ object CompanyIncCompany2Es {
         }
         (new ImmutableBytesWritable, put)
       }).filter(_ != null)
-        .saveAsNewAPIHadoopDataset(jobConf)
-      //        .saveAsHadoopDataset(jobConf)
+//        .saveAsNewAPIHadoopDataset(jobConf)
+              .saveAsHadoopDataset(jobConf)
 
      //write out to es
       import com.winhc.bigdata.spark.utils.CompanyEsUtils.getEsDoc
@@ -171,7 +164,7 @@ object CompanyIncCompany2Es {
 
     val config = EsConfig.getEsConfigMap ++ mutable.Map(
       "spark.hadoop.odps.project.name" -> project,
-      "spark.hadoop.odps.spark.local.partition.amt" -> "10"
+      "spark.hadoop.odps.spark.local.partition.amt" -> "2"
     )
 
     val spark = SparkUtils.InitEnv("company2Es", config)
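
Reviewer note: the cache() added above is presumably motivated by the fact that the same deduplicated DataFrame feeds several sinks afterwards (the ads insert, the HBase write, the ES write), so caching keeps the row_number() dedup query from being recomputed for every action. A minimal, self-contained sketch of that pattern; the demo object, data, and names below are illustrative only and not part of this job:

package com.winhc.bigdata.spark.test

import org.apache.spark.sql.SparkSession

object CacheBeforeMultipleSinks {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("demo").getOrCreate()
    import spark.implicits._

    val df = Seq(("a", 1), ("a", 1), ("b", 2)).toDF("cid", "v")
      .dropDuplicates("cid") // stands in for the row_number() = 1 dedup query
      .cache()               // computed once, reused by every action below

    df.createOrReplaceTempView("tmp_company_inc")
    spark.sql("SELECT count(1) FROM tmp_company_inc").show() // action 1 (e.g. the ads insert)
    println(df.count())                                      // action 2 (e.g. the HBase/ES write)
    spark.stop()
  }
}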

+ 137 - 0
src/main/scala/com/winhc/bigdata/spark/jobs/chance/ChangeExtract.scala

@@ -0,0 +1,137 @@
+package com.winhc.bigdata.spark.jobs.chance
+
+import com.winhc.bigdata.spark.config.EsConfig
+import com.winhc.bigdata.spark.utils.BaseUtil.{cleanup, isWindows}
+import com.winhc.bigdata.spark.utils.ChangeExtractUtils.getCurrentMap
+import com.winhc.bigdata.spark.utils.{BaseUtil, LoggingUtils, SparkUtils}
+import org.apache.spark.sql.functions.col
+import org.apache.spark.sql.types.{MapType, StringType, StructField, StructType}
+import org.apache.spark.sql.{Row, SparkSession}
+
+import scala.annotation.meta.getter
+import scala.collection.mutable
+
+/**
+ * @Author: XuJiakai
+ * @Date: 2020/7/7 11:25
+ * @Description: extract the concrete changes in the data
+ */
+object ChangeExtract {
+
+  case class ChangeExtractUtils(s: SparkSession,
+                                project: String, //project the table belongs to
+                                tableName: String, //table name (without prefix/suffix)
+                                primaryKey: String, //primary key of this dimension
+                                inc_ds: String, //partition to compute
+                                primaryFields: Seq[String], //key fields; if any one of them differs, the record is considered changed
+                                label: (Map[String, String], Map[String, String]) => String // dedup columns
+                               ) extends LoggingUtils {
+    @(transient@getter) val spark: SparkSession = s
+
+    def calc(): Unit = {
+      val cols = primaryFields.filter(!_.equals(primaryKey)).seq
+
+      val ds = inc_ds.replace("-", "")
+
+      val all_cols = primaryKey +: cols :+ "change_flag"
+
+      val lastDs_ads_all = getLastPartitionsOrElse(s"$project.ads_$tableName", "0")
+
+      val intersectCols = getColumns(s"$project.ads_$tableName").toSet & getColumns(s"$project.inc_ads_$tableName").toSet
+
+      val rdd = sql(
+        s"""
+           |SELECT  $primaryKey,${cols.mkString(",")},'0' as change_flag
+           |FROM    $project.inc_ads_$tableName
+           |WHERE   ds = $ds
+           |UNION ALL
+           |SELECT  t2.$primaryKey,${cols.map("t2." + _).mkString(",")},'1' as change_flag
+           |FROM    (
+           |            SELECT  DISTINCT ${primaryKey}
+           |            FROM    $project.inc_ads_$tableName
+           |            WHERE   ds = $ds
+           |        ) AS t1
+           |JOIN    (
+           |             SELECT  tmp.*
+           |             FROM    (
+           |                         SELECT  a.*
+           |                                 ,row_number() OVER (PARTITION BY a.${primaryKey} ORDER BY update_time DESC) c
+           |                         FROM    (
+           |                                     SELECT  ${intersectCols.mkString(",")}
+           |                                     FROM    $project.ads_$tableName
+           |                                     WHERE   ds = $lastDs_ads_all
+           |                                     UNION ALL
+           |                                     SELECT  ${intersectCols.mkString(",")}
+           |                                     FROM    $project.inc_ads_$tableName
+           |                                     WHERE   ds > $lastDs_ads_all and ds < $ds
+           |                                 ) AS a
+           |                     ) AS tmp
+           |             WHERE   tmp.c = 1
+           |        ) AS t2
+           |ON      t1.${primaryKey} = t2.${primaryKey}
+           |""".stripMargin)
+        .select(all_cols.map(column => col(column).cast("string")): _*)
+        .rdd.map(r => {
+        (r.getAs[String](primaryKey), all_cols.map(f => (f, r.getAs[String](f))).toMap)
+      }).groupByKey()
+        .map(x => {
+          val rowkey = x._1
+          val map_list = x._2
+          if (map_list.size == 1) {
+            Row(rowkey, "insert", map_list.head, "新增")
+          } else {
+            if (map_list.size > 2) {
+              logger.error("list.size greater than 2! rowkey:" + rowkey)
+            }
+            val m = getCurrentMap(map_list)
+
+            val new_map = m._1
+            val old_map = m._2
+            val tmp = cols.map(f => {
+              (f, cleanup(new_map(f)).equals(cleanup(old_map(f))))
+            })
+            val eq = tmp.map(_._2).reduce((a1, a2) => a1 && a2)
+
+            if (eq) {
+              null
+            } else {
+              Row(rowkey, "update", new_map, s"更新字段:${tmp.filter(!_._2).map(_._1).mkString(",")}")
+            }
+          }
+        }).filter(_ != null)
+
+      val schema = StructType(Array(
+        StructField("rowkey", StringType),
+        StructField("type", StringType),
+        StructField("data", MapType(StringType, StringType)),
+        StructField("label", StringType)))
+
+      val df = spark.createDataFrame(rdd, schema) //
+
+      df.write
+        .mode(if (isWindows) "append" else "overwrite")
+        .insertInto(s"${project}.tmp_xjk_icp_change")
+    }
+  }
+
+
+  // winhc_eci_dev company cid 20200630 legal_entity_id,reg_location,business_scope,reg_status,reg_capital,emails,phones
+  def main(args: Array[String]): Unit = {
+    val Array(project, tableName, rowkey, inc_ds, pf) = args
+
+    def label(m1: Map[String, String], m2: Map[String, String]): String = {
+      ""
+    }
+
+    val config = EsConfig.getEsConfigMap ++ mutable.Map(
+      "spark.hadoop.odps.project.name" -> project,
+      "spark.hadoop.odps.spark.local.partition.amt" -> "10"
+    )
+
+    val spark = SparkUtils.InitEnv("ChangeExtract", config)
+
+    ChangeExtractUtils(spark, project, tableName, rowkey, inc_ds, pf.split(","), label).calc
+    spark.stop()
+  }
+
+}
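
For reference, a minimal local-run sketch built from the example arguments in the comment above main; the object name and values here are illustrative and not part of the commit:

package com.winhc.bigdata.spark.test

import com.winhc.bigdata.spark.jobs.chance.ChangeExtract

object ChangeExtractLocalRun {
  def main(args: Array[String]): Unit = {
    // argument order: project, table name (without the ads_/inc_ads_ prefix),
    // primary key, partition to compute, comma-separated fields to compare
    ChangeExtract.main(Array(
      "winhc_eci_dev",
      "company",
      "cid",
      "20200630",
      "legal_entity_id,reg_location,business_scope,reg_status,reg_capital,emails,phones"
    ))
  }
}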

+ 10 - 0
src/main/scala/com/winhc/bigdata/spark/test/TestCatalog.scala

@@ -0,0 +1,10 @@
+package com.winhc.bigdata.spark.test
+
+/**
+ * @Author: XuJiakai
+ * @Date: 2020/7/4 10:01
+ * @Description:
+ */
+object TestCatalog {
+
+}

+ 10 - 0
src/main/scala/com/winhc/bigdata/spark/test/addMd5Sql.scala

@@ -0,0 +1,10 @@
+package com.winhc.bigdata.spark.test
+
+/**
+ * @Author: XuJiakai
+ * @Date: 2020/7/3 08:53
+ * @Description:
+ */
+object addMd5Sql {
+
+}

+ 13 - 0
src/main/scala/com/winhc/bigdata/spark/utils/BaseUtil.scala

@@ -2,6 +2,7 @@ package com.winhc.bigdata.spark.utils
 
 import java.util.{Calendar, Date, Locale}
 
+import org.apache.commons.lang3.StringUtils
 import org.apache.commons.lang3.time.DateFormatUtils
 import org.apache.spark.sql.SparkSession
 
@@ -11,6 +12,17 @@ import org.apache.spark.sql.SparkSession
  * @Description:
  */
 object BaseUtil {
+  //strip other symbols and whitespace
+  private val pattern = "[^\\u4e00-\\u9fa50-9a-zA-Z]".r
+
+  //strip other symbols and whitespace; normalize null/blank to empty string
+  def cleanup(s: String): String = {
+    if (StringUtils.isBlank(s))
+      ""
+    else
+      pattern replaceAllIn(s, "")
+  }
+
   def isWindows: Boolean = System.getProperty("os.name").contains("Windows")
 
   def getPartitions(t: String, @transient spark: SparkSession): Seq[String] = {
@@ -19,6 +31,7 @@ object BaseUtil {
     sql(sql_s).collect.toList.map(_.getString(0).split("=")(1)).seq
   }
 
+
   def getPartion(t: String, @transient spark: SparkSession) = {
     val ps = getPartitions(t, spark)
     if (ps.size > 0) {
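
A quick, assumed illustration of the new cleanup() helper: the regex keeps only CJK characters, digits, and latin letters, and blank/null input becomes the empty string, which is why ChangeExtract can compare fields with cleanup(new).equals(cleanup(old)) without tripping over punctuation or whitespace differences. The demo object below is illustrative only:

package com.winhc.bigdata.spark.test

import com.winhc.bigdata.spark.utils.BaseUtil.cleanup

object CleanupDemo {
  def main(args: Array[String]): Unit = {
    println(cleanup("Win.hc 大数据, 2020!")) // prints Winhc大数据2020
    println(cleanup(null))                   // prints an empty line (null becomes "")
    println(cleanup("  "))                   // blank input also becomes ""
  }
}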

+ 33 - 6
src/main/scala/com/winhc/bigdata/spark/utils/LoggingUtils.scala

@@ -2,6 +2,7 @@ package com.winhc.bigdata.spark.utils
 
 import java.io.PrintWriter
 
+import com.winhc.bigdata.spark.utils.BaseUtil.getPartitions
 import org.apache.commons.lang3.StringUtils
 import org.apache.log4j.Logger
 import org.apache.spark.sql.{DataFrame, SparkSession}
@@ -30,12 +31,12 @@ trait LoggingUtils {
     )
     println(
       s"""
-        |- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-        |Job[${this.getClass.getSimpleName}].SQL[No$sqlNo.]
-        |
-        |$sqlString
-        |- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-        |""".stripMargin)
+         |- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+         |Job[${this.getClass.getSimpleName}].SQL[No$sqlNo.]
+         |
+         |$sqlString
+         |- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+         |""".stripMargin)
     sqlNo += 1
     spark.sql(sqlString)
   }
@@ -117,4 +118,30 @@ trait LoggingUtils {
     sb.toString()
   }
 
+
+  def getPartitions(t: String): Seq[String] = {
+    val sql_s = s"show partitions " + t
+    sql(sql_s).collect.toList.map(_.getString(0).split("=")(1)).seq
+  }
+
+  def getLastPartitionsOrElse(t: String, default: String): String = {
+    val ps = getPartitions(t)
+    if (ps.nonEmpty) {
+      ps.last
+    } else {
+      default
+    }
+  }
+  def getHeadPartitionsOrElse(t: String, default: String): String = {
+    val ps = getPartitions(t)
+    if (ps.nonEmpty) {
+      ps.head
+    } else {
+      default
+    }
+  }
+
+  def getColumns(t: String): Seq[String] = {
+    spark.table(t).columns.seq
+  }
 }