许家凯, 4 years ago
commit 0fd6f59d2d

+ 10 - 0
src/main/scala/com/winhc/bigdata/spark/const/EnvConst.scala

@@ -0,0 +1,10 @@
+package com.winhc.bigdata.spark.const
+
+/**
+ * @Author: XuJiakai
+ * @Date: 2020/6/9 14:24
+ * @Description:
+ */
+object EnvConst {
+
+}

+ 23 - 8
src/main/scala/com/winhc/bigdata/spark/jobs/CompanyIndexSave2Es.scala

@@ -3,6 +3,8 @@ package com.winhc.bigdata.spark.jobs
 import com.winhc.bigdata.spark.utils.{EsUtils, SparkUtils}
 import org.apache.commons.lang3.StringUtils
 
+import scala.collection.JavaConverters._
+
 /**
  * @Author: XuJiakai
  * @Date: 2020/6/5 14:28
@@ -13,9 +15,9 @@ object CompanyIndexSave2Es {
 
   case class CompanyName(show: String, value: String)
 
-  case class CompanyDoc(cname: CompanyName, current_id: String = null, history_name: Seq[CompanyName] = null)
+  case class CompanyDoc(cname: CompanyName, current_id: String = null, history_name: Seq[CompanyName] = null, company_type: String)
 
-  def getEsDoc(cid: String, cname: String, other_id_name: scala.collection.Map[String, String], new_cid: String): (String, CompanyDoc) = {
+  def getEsDoc(cid: String, cname: String, other_id_name: scala.collection.Map[String, String], new_cid: String, company_type: String): (String, CompanyDoc) = {
     var history_name: Seq[CompanyName] = null
     if (other_id_name != null) {
       history_name = other_id_name
@@ -28,7 +30,17 @@ object CompanyIndexSave2Es {
         history_name = null
       }
     }
-    (cid, CompanyDoc(getCompanyName(cname), if (cid.equals(new_cid)) null else new_cid, history_name))
+    (cid, CompanyDoc(getCompanyName(cname), if (cid.equals(new_cid)) null else new_cid, history_name, company_type))
+  }
+
+  private def getOtherIdName(str: String): scala.collection.Map[String, String] = {
+    if (str == null) {
+      return null
+    }
+    str.split("\002").map(s => {
+      val sp = s.split("\001")
+      (sp(0), sp(1))
+    }).toMap
   }
 
 
@@ -37,17 +49,20 @@ object CompanyIndexSave2Es {
   def main(args: Array[String]): Unit = {
     val map = EsUtils.getEsConfigMap
 
+    val company_name_mapping = "winhc_eci_dev.company_name_mapping_pro_v2"
+
     val spark = SparkUtils.InitEnv("CompanyIndexSave2Es", map)
     import org.elasticsearch.spark._
     import spark.implicits._
-    val df = spark.sql("select cid,cname,other_id_name,new_cid from company_name_mapping_pro")
+    val df = spark.sql(s"select cid,cname,other_id_name,new_cid,company_type from $company_name_mapping")
     df.map(r => {
-      val cid = r.getString(0)
+      val cid = r.getLong(0).toString
       val cname = r.getString(1)
-      val other_id_name = r.getMap[String, String](2)
+      val other_id_name = getOtherIdName(r.getString(2))
       val new_cid = r.getString(3)
-      getEsDoc(cid, cname, other_id_name, new_cid)
-    }).rdd.saveToEsWithMeta("winhc-company/company")
+      val company_type = r.getString(4)
+      getEsDoc(cid, cname, other_id_name, new_cid, company_type)
+    }).rdd.saveToEsWithMeta("winhc-company-v2/company")
     spark.stop()
   }
 }
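
The new getOtherIdName helper replaces the previous map-typed column read: other_id_name now arrives as a single string, with "\001" separating a cid from its historical name and "\002" separating entries. A minimal sketch of the round trip; the cids and names below are made up for illustration:

// Hypothetical packed value: two history entries joined by \002, key/value split by \001.
val packed = "123" + "\001" + "Old Name Co" + "\002" + "456" + "\001" + "Older Name Co"
val parsed: Map[String, String] = packed
  .split("\002")
  .map { s =>
    val sp = s.split("\001")
    sp(0) -> sp(1) // cid -> historical company name
  }
  .toMap
// parsed == Map("123" -> "Old Name Co", "456" -> "Older Name Co")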

+ 45 - 61
src/main/scala/com/winhc/bigdata/spark/jobs/CompanyNameMappingPro.scala

@@ -1,7 +1,7 @@
 package com.winhc.bigdata.spark.jobs
 
 import com.aliyun.odps.utils.StringUtils
-import com.winhc.bigdata.spark.utils.{HBaseUtils, SparkUtils}
+import com.winhc.bigdata.spark.utils.{CompanyNameMappingUtil, HBaseUtils, SparkUtils}
 import org.apache.hadoop.hbase.client.{Get, Put, Table}
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable
 import org.apache.hadoop.hbase.spark.HBaseContext
@@ -25,67 +25,40 @@ object CompanyNameMappingPro extends Logging {
   val cname_bytes: Array[Byte] = Bytes.toBytes("name")
   val current_cid_bytes: Array[Byte] = Bytes.toBytes("current_cid")
 
-  private def getCurrentIdAndName(table: Table, cid: String): (String, String) = {
-    var current_cid: String = cid
-    var name: String = null
-    val mutableSet = Set(cid)
-    while (true) {
-      val result = table.get(new Get(Bytes.toBytes(current_cid)))
-      if (result.isEmpty) {
-        return (null, null)
-      }
-      val tmp_name = Bytes.toString(CellUtil.cloneValue(result.getColumnCells(f_bytes, cname_bytes) get (0)))
-      var tmp_current_cid: String = null
-      try {
-        tmp_current_cid = Bytes.toString(CellUtil.cloneValue(result.getColumnCells(f_bytes, current_cid_bytes) get (0)))
-      } catch {
-        case e: Exception => {
-        }
-      }
-      if (StringUtils.isEmpty(tmp_current_cid)) {
-        return (current_cid, tmp_name)
-      }
-
-      if (mutableSet.contains(tmp_current_cid)) {
-        return (null, null)
-      }
-      mutableSet += tmp_current_cid
-      current_cid = tmp_current_cid
-      name = tmp_name
-    }
-    (current_cid, name)
-  }
-
-
   def main(args: Array[String]): Unit = {
-
-    import com.winhc.bigdata.spark.utils.BaseUtil.getExecutorConfigOrExit
-    val map = getExecutorConfigOrExit(args)
+    val map = mutable.Map[String, String](
+      "spark.hadoop.odps.spark.local.partition.amt" -> "10"
+    )
+//    import com.winhc.bigdata.spark.utils.BaseUtil.getExecutorConfigOrExit
+//    val map = getExecutorConfigOrExit(args)
 
     val hbaseKVTable = "company_name_kv"
-    val inputTable = "new_ods_company"
-    val resultTable = "company_name_mapping_pro"
-    val tmpResultTable = "company_name_mapping_pro_tmp"
+    val inputTable = "ods_company"
+    val resultTable = "company_name_mapping_pro_v2"
+    val tmpResultTable = "company_name_mapping_pro_tmp_2"
 
 
-    val spark = SparkUtils.InitEnv("CompanyNameMapping", map)
+    val spark = SparkUtils.InitEnv("CompanyNameMapping",map)
     import spark._
     /**
     * Write to HBase, used for lookups
      */
     val jobConf = HBaseUtils.HBaseOutputJobConf(hbaseKVTable)
 
-     val df = sql(s"select cid,name,current_cid from $inputTable")
-     df.rdd.map(row => {
-       val id = row(0).asInstanceOf[Long].toString
-       val name = row(1).asInstanceOf[String]
-       val current_cid = row(2).asInstanceOf[Long].toString
-       val put = new Put(Bytes.toBytes(id))
-       put.addColumn(f_bytes, cname_bytes, Bytes.toBytes(name))
-       if (!"0".equals(current_cid))
-         put.addColumn(f_bytes, current_cid_bytes, Bytes.toBytes(current_cid))
-       (new ImmutableBytesWritable, put)
-     }).saveAsHadoopDataset(jobConf)
+    /*  val df = sql(s"select cid,name,current_cid from $inputTable where cid is not null")
+      df.rdd.map(row => {
+        val id = row(0).asInstanceOf[Long].toString
+        val name = row(1).asInstanceOf[String]
+        val current_cid = row(2).asInstanceOf[Long].toString
+        val put = new Put(Bytes.toBytes(id))
+        if (name != null) {
+          put.addColumn(f_bytes, cname_bytes, Bytes.toBytes(name))
+        }
+        if (!"0".equals(current_cid)) {
+          put.addColumn(f_bytes, current_cid_bytes, Bytes.toBytes(current_cid))
+        }
+        (new ImmutableBytesWritable, put)
+      }).saveAsHadoopDataset(jobConf)*/
 
     logInfo("save hbase success!")
 
@@ -97,10 +70,9 @@ object CompanyNameMappingPro extends Logging {
          |        ,current_cid
          |FROM    $inputTable
          |WHERE   current_cid IS NOT NULL AND cid IS NOT NULL
-         |""".stripMargin)
+         |""".stripMargin).repartition(500)
 
     val hbaseContext = new HBaseContext(spark.sparkContext, jobConf)
-
     /**
     * Query HBase to find the latest company id
      */
@@ -111,7 +83,7 @@ object CompanyNameMappingPro extends Logging {
           val cid = record.getLong(0).toString
           val name = record.getString(1)
           val current_cid = record.getAs[Long](2).toString
-          val (res_cid, res_name) = getCurrentIdAndName(table, current_cid)
+          val (res_cid, res_name) = CompanyNameMappingUtil.getCurrentIdAndName(table, current_cid)
           Row(cid, name, current_cid, res_cid, res_name)
         } catch {
           case e: Exception => {
@@ -121,7 +93,7 @@ object CompanyNameMappingPro extends Logging {
             null
         }
       })
-      table.close()
+//      table.close()
       rdd_par
     }).filter(_ != null)
 
@@ -134,7 +106,8 @@ object CompanyNameMappingPro extends Logging {
 
     val tmp_df = spark.createDataFrame(tmp_rdd, schema)
 
-    tmp_df.createTempView(tmpResultTable) // register temp view
+    tmp_df.write.mode("overwrite").insertInto(tmpResultTable)
+//        tmp_df.createTempView(tmpResultTable) // register temp view
 
     logInfo("new_cid add success")
 
@@ -153,14 +126,26 @@ object CompanyNameMappingPro extends Logging {
          |GROUP BY new_cid
          |""".stripMargin)
       .flatMap(r => {
-        val new_cid = r.getAs[String]("new_cid")
-        val other_id_name = r.getAs[String]("other_id_name")
-        val other_id_name_map = other_id_name.split(";").map(str => (str.split(":")(0)) -> str.split(":")(1)).toMap
+        try {
+          val new_cid = r.getAs[String]("new_cid")
+          val other_id_name = r.getAs[String]("other_id_name")
+          val other_id_name_map = other_id_name.split(";").map(str => (str.split(":")(0)) -> str.split(":")(1)).toMap
 
-        other_id_name.split(";").map(str => (str.split(":")(0), other_id_name_map)).:+((new_cid, other_id_name_map)).toSeq
+          other_id_name
+            .split(";")
+            .map(str => (str.split(":")(0), other_id_name_map)).:+((new_cid, other_id_name_map)).toSeq
+        } catch {
+          case e: Exception => {
+            logError(e.getMessage)
+          }
+            logError(r.getAs[String]("new_cid"))
+            logError(r.getAs[String]("other_id_name"))
+            null
+        }
       }).filter(_ != null).toDF("cid", "other_id_name")
 
     id_other.printSchema()
+    id_other.show()
 
     /**
     * Backfill the data: add companies that have never had a name change
@@ -186,7 +171,6 @@ object CompanyNameMappingPro extends Logging {
       tmp_df3.join(id_other, Seq("cid"), "left").select(
         "cid"
         , "cname"
-        //        , s"current_cid"
         , "other_id_name"
         , "new_cid"
         , "new_cname"

+ 96 - 0
src/main/scala/com/winhc/bigdata/spark/jobs/CompanyNameMappingPro_stage_01.scala

@@ -0,0 +1,96 @@
+package com.winhc.bigdata.spark.jobs
+
+import com.winhc.bigdata.spark.utils.{CompanyNameMappingUtil, HBaseUtils, SparkUtils}
+import org.apache.hadoop.hbase.TableName
+import org.apache.hadoop.hbase.spark.HBaseContext
+import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
+import org.apache.hadoop.hbase.util.Bytes
+import org.apache.spark.internal.Logging
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types.{StringType, StructField, StructType}
+
+/**
+ * @Author: XuJiakai
+ * @Date: 2020/5/26 10:41
+ * @Description: Company name mapping computation; only resolves the latest company id
+ */
+object CompanyNameMappingPro_stage_01 extends Logging {
+  val f_bytes: Array[Byte] = Bytes.toBytes("f")
+  val cname_bytes: Array[Byte] = Bytes.toBytes("name")
+  val current_cid_bytes: Array[Byte] = Bytes.toBytes("current_cid")
+
+  def main(args: Array[String]): Unit = {
+    val inputTable = "xjk_company_name_mapping_tmp"
+    val hbaseKVTable = "company_name_kv"
+
+    val spark = SparkUtils.InitEnv("CompanyNameMappingPro_stage_01")
+    import spark._
+    /**
+     * Write to HBase, used for lookups
+     */
+    val jobConf = HBaseUtils.HBaseOutputJobConf(hbaseKVTable)
+
+    val df_old = sql(
+      s"""
+         |SELECT  cid
+         |        ,cname
+         |        ,current_cid
+         |        ,new_cid
+         |        ,new_cname
+         |        ,flag
+         |FROM    $inputTable
+         |WHERE new_cid is null
+         |""".stripMargin).repartition(50)
+
+    val hbaseContext = new HBaseContext(spark.sparkContext, jobConf)
+    /**
+     * Query HBase to find the latest company id
+     */
+    val tmp_rdd = df_old.rdd.hbaseMapPartitions(hbaseContext, (f, con) => {
+      val table = con.getTable(TableName.valueOf(hbaseKVTable))
+      val rdd_par = f.map(record => {
+        val cid = record.getString(0)
+        val name = record.getString(1)
+        val current_cid = record.getAs[String](2)
+        try {
+          val (res_cid, res_name) = CompanyNameMappingUtil.getCurrentIdAndName(table, current_cid)
+          Row(cid, name, current_cid, res_cid, res_name, "0")
+        } catch {
+          case e: Exception => {
+            logWarning(record.toString())
+            logError(e.getMessage, e)
+          }
+            Row(cid, name, current_cid, null, null, "-1")
+        }
+      })
+      rdd_par
+    }).filter(_ != null)
+
+    val schema = StructType(Array(
+      StructField("cid", StringType),
+      StructField("cname", StringType),
+      StructField("current_cid", StringType),
+      StructField("new_cid", StringType),
+      StructField("new_cname", StringType),
+      StructField("flag", StringType)
+    ))
+
+    val tmp_df = spark.createDataFrame(tmp_rdd, schema)
+
+    val df = sql(
+      """
+        |SELECT  cid
+        |        ,cname
+        |        ,current_cid
+        |        ,new_cid
+        |        ,new_cname
+        |        ,flag
+        |FROM    xjk_company_name_mapping_tmp
+        |WHERE new_cid is not null
+        |""".stripMargin)
+    tmp_df.union(df).write.mode("overwrite").insertInto(inputTable)
+
+    logInfo("CompanyNameMapping success")
+    spark.stop()
+  }
+}

+ 0 - 3
src/main/scala/com/winhc/bigdata/spark/test/TestSpark2Hbase.scala

@@ -21,13 +21,10 @@ object TestSpark2Hbase extends Logging {
   def main(args: Array[String]): Unit = {
     val map = mutable.Map[String, String](
     )
-
     val hbaseKVTable = "company_name_kv_tmp"
-    val inputTable = "new_ods_company"
 
     val spark = SparkUtils.InitEnv("TestSpark2Hbase", map)
     import spark._
-    val df = sql(s"select cid,name,current_cid from $inputTable")
 
     val jobConf = HBaseUtils.HBaseOutputJobConf(hbaseKVTable)
     val df1 = spark.createDataFrame(Seq(("1", "2", "3"))).toDF("col0", "col1", "col2")

+ 50 - 0
src/main/scala/com/winhc/bigdata/spark/test/TestSpark4Hbase.scala

@@ -0,0 +1,50 @@
+package com.winhc.bigdata.spark.test
+
+import com.winhc.bigdata.spark.utils.{CompanyNameMappingUtil, HBaseUtils, SparkUtils}
+import org.apache.hadoop.hbase.TableName
+import org.apache.hadoop.hbase.spark.HBaseContext
+import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types.{StringType, StructField, StructType}
+
+/**
+ * @Author: XuJiakai
+ * @Date: 2020/6/10 17:09
+ * @Description:
+ */
+object TestSpark4Hbase {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkUtils.InitEnv("4hbase")
+    import spark.implicits._
+    val hbaseKVTable = "company_name_kv"
+    val jobConf = HBaseUtils.HBaseOutputJobConf(hbaseKVTable)
+    val hbaseContext = new HBaseContext(spark.sparkContext, jobConf)
+    val rdd = spark.createDataset(Seq("3190263436", "3190295172", "3190295172")).rdd.hbaseMapPartitions(hbaseContext, (f, con) => {
+      val table = con.getTable(TableName.valueOf(hbaseKVTable))
+      val rdd_par = f.map(record => {
+        try {
+          val cid = record
+          val (res_cid, res_name) = CompanyNameMappingUtil.getCurrentIdAndName(table, cid)
+          Row(cid, null, null, res_cid, res_name)
+        } catch {
+          case e: Exception => {
+            println(record.toString())
+            println(e.getMessage, e)
+          }
+            null
+        }
+      })
+      rdd_par
+    }).filter(_ != null)
+    val schema = StructType(Array(
+      StructField("cid", StringType),
+      StructField("cname", StringType),
+      StructField("current_cid", StringType),
+      StructField("new_cid", StringType),
+      StructField("new_cname", StringType)))
+    spark.createDataFrame(rdd, schema).show()
+    spark.stop()
+
+  }
+
+}

+ 1 - 39
src/main/scala/com/winhc/bigdata/spark/utils/BaseUtil.scala

@@ -1,7 +1,7 @@
 package com.winhc.bigdata.spark.utils
 
-import scala.collection.mutable
 import java.util.{Calendar, Date, Locale}
+
 import org.apache.commons.lang3.time.DateFormatUtils
 import org.apache.spark.sql.SparkSession
 
@@ -13,44 +13,7 @@ import org.apache.spark.sql.SparkSession
 object BaseUtil {
   def isWindows: Boolean = System.getProperty("os.name").contains("Windows")
 
-  def getExecutorConfigOrExit(args: Array[String]): mutable.Map[String, String] = {
-    if (args.length != 3) {
-      println("请配置计算资源: instances, cores, memory .")
-      sys.exit(-1)
-    }
-    val Array(instances, cores, memory) = args;
-    getExecutorConfig(instances, cores, memory)
-  }
-
-  def getExecutorConfigOrDefault(args: Array[String]): mutable.Map[String, String] = {
-    var instances, cores, memory: String = null
-    if (args.length != 3) {
-      println("使用默认的计算资源: 2 instances, 2 cores, 10g memory.")
-      instances = "2"
-      cores = "2"
-      memory = "10g"
-    } else {
-      instances = args(0)
-      cores = args(1)
-      memory = args(2)
-    }
-    getExecutorConfig(instances, cores, memory)
-  }
-
-  private def getExecutorConfig(instances: String, cores: String, memory: String): mutable.Map[String, String] = {
-    println(
-      s"""
-         |instances : $instances,
-         |cores : $cores,
-         |memory : $memory
-         |""".stripMargin)
-    mutable.Map("spark.executor.instances" -> instances,
-      "spark.executor.cores" -> cores,
-      "spark.executor.memory" -> memory
-    )
-  }
   def getPartion(t: String, @transient spark: SparkSession) = {
-    import spark.implicits._
     import spark._
     val sql_s = s"show partitions " + t
     sql(sql_s).collect.toList.map(_.getString(0).split("=")(1)).last
@@ -62,5 +25,4 @@ object BaseUtil {
     c.add(Calendar.MONTH, -1 * n)
     DateFormatUtils.format(c.getTime.getTime, pattern)
   }
-
 }

+ 77 - 0
src/main/scala/com/winhc/bigdata/spark/utils/CompanyNameMappingUtil.scala

@@ -0,0 +1,77 @@
+package com.winhc.bigdata.spark.utils
+
+import cn.oyohotels.utils.HbaseUtil
+import com.aliyun.odps.utils.StringUtils
+import com.winhc.bigdata.spark.jobs.CompanyNameMappingPro.{cname_bytes, current_cid_bytes, f_bytes}
+import org.apache.hadoop.hbase.CellUtil
+import org.apache.hadoop.hbase.client.{Get, Table}
+import org.apache.hadoop.hbase.util.Bytes
+
+import scala.collection.mutable.Set
+
+/**
+ * @Author: XuJiakai
+ * @Date: 2020/6/10 16:13
+ * @Description:
+ */
+object CompanyNameMappingUtil {
+
+  def getCurrentIdAndName(table: Table, cid: String): (String, String) = {
+    var current_cid: String = cid
+    var name: String = null
+    val mutableSet = Set(cid)
+    while (true) {
+      val result = table.get(new Get(Bytes.toBytes(current_cid)))
+      if (result.isEmpty) {
+        return (null, null)
+      }
+      val tmp_name = Bytes.toString(CellUtil.cloneValue(result.getColumnCells(f_bytes, cname_bytes) get (0)))
+      var tmp_current_cid: String = null
+      try {
+        tmp_current_cid = Bytes.toString(CellUtil.cloneValue(result.getColumnCells(f_bytes, current_cid_bytes) get (0)))
+      } catch {
+        case e: Exception => {
+        }
+      }
+      if (StringUtils.isEmpty(tmp_current_cid)) {
+        return (current_cid, tmp_name)
+      }
+
+      if (mutableSet.contains(tmp_current_cid)) {
+        println("set contains cid:" + cid)
+        return (null, null)
+      }
+      mutableSet += tmp_current_cid
+      current_cid = tmp_current_cid
+      name = tmp_name
+    }
+    (current_cid, name)
+  }
+
+
+  def main(args: Array[String]): Unit = {
+
+    """
+      |3190263436
+      |3190295172
+      |3190378251
+      |3190391976
+      |3190397579
+      |3190406832
+      |3193723587
+      |3193773044
+      |
+      |""".stripMargin
+    val table = HbaseUtil.getTable("company_name_kv")
+    for (id <- Seq("3190263436",
+      "3190295172",
+      "3190295172",
+      "3190378251",
+      "3190391976",
+      "3190397579",
+      "3190406832",
+      "3193723587",
+      "3193773044"))
+      println(getCurrentIdAndName(table, id))
+  }
+}
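
getCurrentIdAndName (moved here from CompanyNameMappingPro) follows the current_cid chain in the company_name_kv table until it reaches a row with no current_cid, and bails out with (null, null) on a missing row or a cycle. The same walk against a hypothetical in-memory stand-in for the HBase table, for illustration only:

// Each entry: cid -> (name, optional current_cid pointing at the newer record). Made-up data.
val kv: Map[String, (String, Option[String])] = Map(
  "1" -> ("Foo Ltd (old)", Some("2")),
  "2" -> ("Foo Ltd", None))

def resolve(cid: String): (String, String) = {
  var cur = cid
  val seen = scala.collection.mutable.Set(cid)
  while (true) {
    kv.get(cur) match {
      case None               => return (null, null)   // row not found
      case Some((name, None)) => return (cur, name)    // end of chain: latest record
      case Some((_, Some(next))) =>
        if (seen.contains(next)) return (null, null)   // cycle guard, mirrors the mutableSet check
        seen += next
        cur = next
    }
  }
  (null, null) // unreachable; satisfies the return type after while(true)
}
// resolve("1") == ("2", "Foo Ltd")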

+ 4 - 0
src/main/scala/com/winhc/bigdata/spark/utils/CompanySummaryUtils.scala

@@ -7,4 +7,8 @@ package com.winhc.bigdata.spark.utils
  */
 object CompanySummaryUtils {
   def getSummarySql(tableName: String, companyIdFieldName: String) = s"select $companyIdFieldName as company_id,count(1) as ${tableName}_num from $tableName where $companyIdFieldName <>0 group by $companyIdFieldName"
+
+  def main(args: Array[String]): Unit = {
+    println(getSummarySql("abc","ncid"))
+  }
 }
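
For the sample call added in main, the interpolation should produce the following string (a quick sanity check, assuming the method as committed):

// Expected output of getSummarySql("abc", "ncid"):
val expected =
  "select ncid as company_id,count(1) as abc_num from abc where ncid <>0 group by ncid"
assert(CompanySummaryUtils.getSummarySql("abc", "ncid") == expected)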

+ 15 - 5
src/main/scala/com/winhc/bigdata/spark/utils/HBaseUtils.scala

@@ -1,6 +1,7 @@
 package com.winhc.bigdata.spark.utils
 
 import com.winhc.bigdata.spark.utils.BaseUtil.isWindows
+import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.hbase.mapred.TableOutputFormat
 import org.apache.hadoop.hbase.{HBaseConfiguration, HConstants}
 import org.apache.hadoop.mapred.JobConf
@@ -11,19 +12,28 @@ import org.apache.hadoop.mapred.JobConf
  * @Description:
  */
 object HBaseUtils {
-  def HBaseOutputJobConf(outputTable: String): JobConf = {
+  def getHbaseConf(): Configuration = {
     val config = HBaseConfiguration.create()
     var zkAddress: String = null
     if (isWindows) {
-      zkAddress = "hb-proxy-pub-uf6as8i6h85k02092-001.hbase.rds.aliyuncs.com"
+      zkAddress = "hb-proxy-pub-uf63a7d09rpl8mcvm-001.hbase.rds.aliyuncs.com"
       import com.alibaba.dcm.DnsCacheManipulator
-      DnsCacheManipulator.setDnsCache("hb-uf6as8i6h85k02092-001.hbase.rds.aliyuncs.com", "47.101.251.157")
+      DnsCacheManipulator.setDnsCache("hb-uf63a7d09rpl8mcvm-001.hbase.rds.aliyuncs.com", "47.101.250.84")
     } else {
-      zkAddress = "hb-uf6as8i6h85k02092-001.hbase.rds.aliyuncs.com"
+      zkAddress = "hb-uf6m8e1nu4ivp06m5-master1-001.hbase.rds.aliyuncs.com,hb-uf6m8e1nu4ivp06m5-master2-001.hbase.rds.aliyuncs.com,hb-uf6m8e1nu4ivp06m5-master3-001.hbase.rds.aliyuncs.com"
+//      zkAddress = "hb-uf63a7d09rpl8mcvm-001.hbase.rds.aliyuncs.com"
+
+      //      zkAddress = "ld-uf6717qu3qh1t80z8-proxy-hbaseue.hbaseue.rds.aliyuncs.com:30020"
+      //      config.set("hbase.client.username", "root");
+      //      config.set("hbase.client.password", "root");
+      //      config.set("hbase.client.connection.impl", "org.apache.hadoop.hbase.client.AliHBaseUEClusterConnection");
     }
     config.set(HConstants.ZOOKEEPER_QUORUM, zkAddress);
+    config
+  }
 
-    val jobConf = new JobConf(config)
+  def HBaseOutputJobConf(outputTable: String): JobConf = {
+    val jobConf = new JobConf(getHbaseConf())
     jobConf.setOutputFormat(classOf[TableOutputFormat])
     jobConf.set(TableOutputFormat.OUTPUT_TABLE, outputTable)
     jobConf
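
Extracting getHbaseConf() lets the same ZooKeeper configuration back a plain client connection as well as the MapReduce JobConf (HbaseUtil.conf reuses it in the next file). A minimal usage sketch with the standard HBase client API; the table name is the one used by the jobs above:

import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.ConnectionFactory
import com.winhc.bigdata.spark.utils.HBaseUtils

// Open a connection from the shared Configuration and fetch a Table handle.
val conn  = ConnectionFactory.createConnection(HBaseUtils.getHbaseConf())
val table = conn.getTable(TableName.valueOf("company_name_kv"))
// ... Get / Put against the table ...
table.close()
conn.close()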

+ 2 - 12
src/main/scala/com/winhc/bigdata/spark/utils/HbaseUtil.scala

@@ -1,16 +1,9 @@
 package cn.oyohotels.utils
 
-import java.util
-import com.alibaba.dcm.DnsCacheManipulator
-import org.apache.hadoop.hbase.client.Delete
+import com.winhc.bigdata.spark.utils.HBaseUtils
 import org.apache.hadoop.hbase._
 import org.apache.hadoop.hbase.client._
-import org.apache.hadoop.hbase.io.ImmutableBytesWritable
-import org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableMapReduceUtil, TableOutputFormat}
 import org.apache.hadoop.hbase.util.Bytes
-import org.apache.hadoop.mapreduce.Job
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, Row, SparkSession}
 import org.slf4j.LoggerFactory
 
 
@@ -22,10 +15,7 @@ object HbaseUtil {
   val lrs = HbaseUtil.getClass.getResource("/").getPath
 
   lazy val conf = {
-    val myConf = HBaseConfiguration.create()
-    myConf.set("hbase.zookeeper.property.clientPort", "2181")
-    myConf.set("hbase.zookeeper.quorum", "hb-proxy-pub-uf6as8i6h85k02092-001.hbase.rds.aliyuncs.com")
-    DnsCacheManipulator.setDnsCache("hb-uf6as8i6h85k02092-001.hbase.rds.aliyuncs.com", "47.101.251.157")
+    val myConf = HBaseUtils.getHbaseConf()
     myConf
   }
 

+ 1 - 0
src/main/scala/com/winhc/bigdata/spark/utils/SparkUtils.scala

@@ -19,6 +19,7 @@ object SparkUtils {
       .config("spark.sql.crossJoin.enabled", true)
       .config("spark.hadoop.odps.cupid.smartnat.enable", true)
       .config("odps.exec.dynamic.partition.mode", "nonstrict")
+//      .config("spark.hadoop.odps.project.name", "winhc_eci_dev")
       .config("spark.hadoop.odps.project.name", "winhc_test_dev")
       .config("spark.hadoop.odps.access.id", "LTAI4G4n7pAW8tUbJVkkZQPD")
       .config("spark.hadoop.odps.access.key", "uNJOBskzcDqHq1TYG3m2rebR4c1009")