
Company name mapping relationship calculation

许家凯, 4 years ago
Commit de4438f583

+ 12 - 14
pom.xml

@@ -101,9 +101,9 @@
             <version>${cupid.sdk.version}</version>
         </dependency>
         <dependency>
-            <groupId>org.mongodb</groupId>
-            <artifactId>mongo-java-driver</artifactId>
-            <version>3.4.2</version>
+            <groupId>org.apache.hbase.connectors.spark</groupId>
+            <artifactId>hbase-spark</artifactId>
+            <version>1.0.0</version>
         </dependency>
     </dependencies>
 
@@ -123,21 +123,19 @@
                             <minimizeJar>false</minimizeJar>
                             <shadedArtifactAttached>true</shadedArtifactAttached>
                             <artifactSet>
-<!--                                <excludes>-->
-<!--                                    <exclude>jmock:*</exclude>-->
-<!--                                    <exclude>*:xml-apis</exclude>-->
-<!--                                    <exclude>org.apache.maven:lib:tests</exclude>-->
-<!--                                    <exclude>log4j:log4j:jar:</exclude>-->
-<!--                                </excludes>-->
+                                <!--                                <excludes>-->
+                                <!--                                    <exclude>jmock:*</exclude>-->
+                                <!--                                    <exclude>*:xml-apis</exclude>-->
+                                <!--                                    <exclude>org.apache.maven:lib:tests</exclude>-->
+                                <!--                                    <exclude>log4j:log4j:jar:</exclude>-->
+                                <!--                                </excludes>-->
                                 <includes>
-<!--                                    <include>*:*</include>-->
+                                    <!--                                    <include>*:*</include>-->
                                     <include>cn.hutool:*</include>
-                                    <include>org.mongodb:*</include>
-                                    <include>org.mongodb.spark:*</include>
                                     <include>com.aliyun.odps:*</include>
                                     <include>org.mongodb.*:*</include>
-                                    <include>org.mongodb:mongo-java-driver</include>
-<!--                                    <include>com.aliyun.odps:odps-spark-datasource_2.11:*</include>-->
+                                    <include>org.apache.hbase:*</include>
+                                    <!--                                    <include>com.aliyun.odps:odps-spark-datasource_2.11:*</include>-->
                                 </includes>
                             </artifactSet>
                             <filters>

+ 126 - 0
src/main/scala/com/winhc/bigdata/spark/jobs/CompanyNameMapping.scala

@@ -0,0 +1,126 @@
+package com.winhc.bigdata.spark.jobs
+
+import com.aliyun.odps.utils.StringUtils
+import com.winhc.bigdata.spark.utils.SparkUtils
+import org.apache.hadoop.hbase.client.{Get, Put, Table}
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.spark.HBaseContext
+import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
+import org.apache.hadoop.hbase.util.Bytes
+import org.apache.hadoop.hbase.{CellUtil, TableName}
+import org.apache.spark.internal.Logging
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types.{StringType, StructField, StructType}
+
+import scala.collection.mutable
+import scala.collection.mutable.Set
+
+/**
+ * @Author: XuJiakai
+ * @Date: 2020/5/26 10:41
+ * @Description: Company name mapping relationship calculation
+ */
+object CompanyNameMapping extends Logging {
+  val f_bytes: Array[Byte] = Bytes.toBytes("f")
+  val name_bytes: Array[Byte] = Bytes.toBytes("name")
+  val current_cid_bytes: Array[Byte] = Bytes.toBytes("current_cid")
+
+
+  def getCurrentIdAndName(table: Table, cid: String): (String, String) = {
+    var current_cid: String = cid
+    var name: String = null
+    val mutableSet = Set(cid)
+    while (true) {
+      val result = table.get(new Get(Bytes.toBytes(current_cid)))
+
+      if (result.isEmpty) {
+        return (null, null)
+      }
+      val tmp_name = Bytes.toString(CellUtil.cloneValue(result.getColumnCells(f_bytes, name_bytes) get (0)))
+      var tmp_current_cid: String = null
+      try {
+        tmp_current_cid = Bytes.toString(CellUtil.cloneValue(result.getColumnCells(f_bytes, current_cid_bytes) get (0)))
+      } catch {
+        case e: Exception => {
+        }
+      }
+      if (StringUtils.isEmpty(tmp_current_cid)) {
+        return (current_cid, tmp_name)
+      }
+
+      if (mutableSet.contains(tmp_current_cid)) {
+        return (null, null)
+      }
+      mutableSet += tmp_current_cid
+      current_cid = tmp_current_cid
+      name = tmp_name
+    }
+    (current_cid, name)
+  }
+
+
+  def main(args: Array[String]): Unit = {
+    val map = mutable.Map[String, String](
+      "spark.hadoop.odps.cupid.vpc.domain.list" -> "{\"regionId\":\"cn-shanghai\",\"vpcs\":[{\"vpcId\":\"vpc-11hby9xee\",\"zones\":[{\"urls\":[{\"domain\":\"dds-uf6ff5dfd9aef3641.mongodb.rds.aliyuncs.com\",\"port\":3717},{\"domain\":\"dds-uf6ff5dfd9aef3642.mongodb.rds.aliyuncs.com\",\"port\":3717},{\"domain\":\"hb-uf6as8i6h85k02092-001.hbase.rds.aliyuncs.com\",\"port\":2181}]}]}]}"
+    )
+
+    val hbaseKVTable = "company_name_kv"
+    val resultTable = "company_name_mapping_2"
+    val inputTable = "new_ods_company"
+
+    val spark = SparkUtils.InitEnv("CompanyNameMapping", map)
+    import spark._
+    val df = sql(s"select cid,name,current_cid from $inputTable")
+
+    val jobConf = SparkUtils.HBaseOutputJobConf(hbaseKVTable)
+    df.rdd.map(row => {
+      val id = row(0).asInstanceOf[Long].toString
+      val name = row(1).asInstanceOf[String]
+      val current_cid = row(2).asInstanceOf[Long].toString
+      val put = new Put(Bytes.toBytes(id))
+      put.addColumn(f_bytes, name_bytes, Bytes.toBytes(name))
+      if (!"0".equals(current_cid))
+        put.addColumn(f_bytes, current_cid_bytes, Bytes.toBytes(current_cid))
+      (new ImmutableBytesWritable, put)
+    }).saveAsHadoopDataset(jobConf)
+
+    logInfo("save hbase success!")
+
+    val df_old = sql(s"select cid,name,current_cid from $inputTable where current_cid is not null")
+
+    val hbaseContext = new HBaseContext(spark.sparkContext, jobConf)
+
+    val res_rdd = df_old.rdd.hbaseMapPartitions(hbaseContext, (f, con) => {
+      val table = con.getTable(TableName.valueOf(hbaseKVTable))
+      val rdd_par = f.map(record => {
+        try {
+          val cid = record.getLong(0).toString
+          val name = record.getString(1)
+          val current_cid = record.getLong(2).toString
+          val (res_cid, res_name) = getCurrentIdAndName(table, current_cid)
+          Row(cid, name, current_cid, res_cid, res_name)
+        } catch {
+          case e: Exception => {
+            logWarning(record.toString())
+            logError(e.getMessage, e)
+          }
+            null
+        }
+      })
+      table.close()
+      rdd_par
+    }).filter(_ != null)
+
+    val schema = StructType(Array(
+      StructField("cid", StringType),
+      StructField("name", StringType),
+      StructField("current_cid", StringType),
+      StructField("res_cid", StringType),
+      StructField("res_name", StringType)))
+
+    val res_df = spark.createDataFrame(res_rdd, schema)
+    res_df.write.mode("append").insertInto(resultTable)
+    logInfo("CompanyNameMapping success")
+    spark.stop()
+  }
+}
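
Note on the new job: the core of CompanyNameMapping is the current_cid chain walk in getCurrentIdAndName. Starting from a cid, it follows the current_cid column in the company_name_kv HBase table row by row until it reaches a company with no current_cid, and gives up (returns (null, null)) on a missing row or a cycle. Below is a minimal, self-contained sketch of the same resolution rules, with the HBase table swapped for an in-memory Map purely for illustration; it is not part of the commit.

// Illustration only: getCurrentIdAndName's lookup chain, with the HBase table
// replaced by a Map of cid -> (name, optional current_cid). Same rules:
// stop at a row without current_cid, return (null, null) on a missing row or a cycle.
object ChainResolutionSketch {
  def resolve(kv: Map[String, (String, Option[String])], cid: String): (String, String) = {
    var current = cid
    val seen = scala.collection.mutable.Set(cid)
    while (true) {
      kv.get(current) match {
        case None                  => return (null, null)     // row missing
        case Some((name, None))    => return (current, name)  // terminal company
        case Some((_, Some(next))) =>
          if (seen.contains(next)) return (null, null)        // cycle guard
          seen += next
          current = next
      }
    }
    (null, null) // unreachable; keeps the result type (String, String)
  }

  def main(args: Array[String]): Unit = {
    val kv = Map(
      "1" -> ("Old Name A", Some("2")),
      "2" -> ("Old Name B", Some("3")),
      "3" -> ("Current Name", None))
    println(resolve(kv, "1")) // prints (3,Current Name)
  }
}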

+ 3 - 4
src/main/scala/com/winhc/bigdata/spark/jobs/CompanySummaryBySingle.scala

@@ -3,7 +3,7 @@ package com.winhc.bigdata.spark.jobs
 import com.mongodb.spark.MongoSpark
 import com.winhc.bigdata.spark.utils.CompanySummaryUtils._
 import com.winhc.bigdata.spark.utils.SparkUtils
-import org.apache.commons.logging.LogFactory
+import org.apache.spark.internal.Logging
 
 import scala.collection.mutable
 
@@ -12,12 +12,11 @@ import scala.collection.mutable
  * @Date: 2020/5/21 11:18
  * @Description: Write summary for a single table
  */
-object CompanySummaryBySingle {
-  private val log = LogFactory.getLog(this.getClass)
+object CompanySummaryBySingle extends Logging {
 
   def main(args: Array[String]): Unit = {
     if (args.length < 1) {
-      log.error("请输入表名!")
+      logError("请输入表名!")
       sys.exit(-1)
     }
     val tableName = args(0)

+ 0 - 44
src/main/scala/com/winhc/bigdata/spark/summary/ProbidCalculator.scala

@@ -1,44 +0,0 @@
-package com.winhc.bigdata.spark.summary
-
-import com.mongodb.spark.MongoSpark
-import com.winhc.bigdata.spark.jobs.CompanyInfoCalculator.{LOG, prepare}
-import com.winhc.bigdata.spark.utils.SparkUtils
-import org.apache.commons.logging.LogFactory
-import org.apache.spark.sql.{SaveMode, SparkSession}
-import org.apache.spark.sql.SparkSession
-
-object ProbidCalculator {
-  private val LOG = LogFactory.getLog(this.getClass)
-
-  def main(args: Array[String]): Unit = {
-    val database = "itslaw"
-    val collection = "probid_commonpro"
-//    val host = "dds-uf6ff5dfd9aef3641601-pub.mongodb.rds.aliyuncs.com:3717,dds-uf6ff5dfd9aef3642555-pub.mongodb.rds.aliyuncs.com:3717/itslaw?replicaSet=mgset-6501997"
-    val host = "dds-uf6ff5dfd9aef3641.mongodb.rds.aliyuncs.com:3717,dds-uf6ff5dfd9aef3642.mongodb.rds.aliyuncs.com:3717/itslaw?replicaSet=mgset-6501997"
-    val outPutUri = s"mongodb://itslaw:itslaw_168@$host"
-    val spark: SparkSession = SparkUtils.InitEnvRaw("ProbidCalculator")
-      .config("spark.mongodb.input.uri", outPutUri)
-      .config("spark.mongodb.input.collection", collection)
-      .config("spark.mongodb.output.uri", outPutUri)
-      .config("spark.mongodb.output.database",database)
-      .config("spark.mongodb.output.collection",collection)
-      .config("spark.hadoop.odps.cupid.smartnat.enable",true)
-      .getOrCreate()
-
-    LOG.info("probid calc start!")
-    println("probid calc start!")
-
-    val srcTableName = "ods_probid_commonpro_winbidding_companylist"
-
-    val df = spark.sql(s"SELECT  company_id,company_name,SUM(winbidding_id) FROM ${srcTableName} GROUP BY company_id,company_name LIMIT 100".stripMargin)
-    MongoSpark.save(
-      df
-        .write
-        .mode(SaveMode.Append)
-    )
-
-    LOG.info("probid calc end!")
-    println("probid calc end!")
-    spark.stop();
-  }
-}

+ 28 - 0
src/main/scala/com/winhc/bigdata/spark/test/TestSparkSql.scala

@@ -0,0 +1,28 @@
+package com.winhc.bigdata.spark.test
+
+import com.winhc.bigdata.spark.utils.SparkUtils
+import org.apache.spark.internal.Logging
+
+import scala.collection.mutable
+
+/**
+ * @Author: XuJiakai
+ * @Date: 2020/5/25 08:39
+ * @Description:
+ */
+object TestSparkSql extends Logging {
+  def main(args: Array[String]): Unit = {
+    logInfo("start。。。")
+    val map = mutable.Map[String, String](
+      "spark.hadoop.odps.spark.local.partition.amt" -> "10"
+    )
+    val spark = SparkUtils.InitEnv("test",map)
+    import spark._
+    val df = sql("select * from ods_company limit 100")
+    df.printSchema()
+    df.foreach(println(_))
+    logInfo("end")
+    spark.stop()
+  }
+
+}

+ 15 - 0
src/main/scala/com/winhc/bigdata/spark/utils/SparkUtils.scala

@@ -1,11 +1,25 @@
 package com.winhc.bigdata.spark.utils
 
+import org.apache.hadoop.hbase.{HBaseConfiguration, HConstants}
+import org.apache.hadoop.hbase.mapred.TableOutputFormat
+import org.apache.hadoop.mapred.JobConf
 import org.apache.spark.sql.SparkSession
 
 import scala.collection.mutable
 
 object SparkUtils {
 
+  def HBaseOutputJobConf(outputTable: String): JobConf = {
+    val config = HBaseConfiguration.create()
+    val zkAddress = "hb-uf6as8i6h85k02092-001.hbase.rds.aliyuncs.com"
+    config.set(HConstants.ZOOKEEPER_QUORUM, zkAddress);
+
+    val jobConf = new JobConf(config)
+    jobConf.setOutputFormat(classOf[TableOutputFormat])
+    jobConf.set(TableOutputFormat.OUTPUT_TABLE, outputTable)
+    jobConf
+  }
+
   def InitEnv(appName: String): SparkSession = {
     InitEnv(appName, null)
   }
@@ -36,6 +50,7 @@ object SparkUtils {
     }
     spark.getOrCreate()
   }
+
   def InitEnvRaw(appName: String) = {
     val spark = SparkSession
       .builder()
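
For completeness, here is a minimal usage sketch of the new SparkUtils.HBaseOutputJobConf helper, mirroring the write phase of CompanyNameMapping; the table name "demo_table" and the row written below are placeholders for illustration, not objects that exist in the project.

// Sketch only: writes one Put through SparkUtils.HBaseOutputJobConf, the same
// path CompanyNameMapping uses for company_name_kv. Table and values are made up.
import com.winhc.bigdata.spark.utils.SparkUtils
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes

object HBaseWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkUtils.InitEnv("HBaseWriteSketch")
    val jobConf = SparkUtils.HBaseOutputJobConf("demo_table") // hypothetical table
    spark.sparkContext
      .parallelize(Seq(("1", "demo name")))
      .map { case (rowKey, value) =>
        val put = new Put(Bytes.toBytes(rowKey))
        put.addColumn(Bytes.toBytes("f"), Bytes.toBytes("name"), Bytes.toBytes(value))
        (new ImmutableBytesWritable, put)
      }
      .saveAsHadoopDataset(jobConf)
    spark.stop()
  }
}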