
Company name mapping relationship calculation

许家凯, 4 years ago
Commit de4438f583

+ 12 - 14
pom.xml

@@ -101,9 +101,9 @@
             <version>${cupid.sdk.version}</version>
         </dependency>
         <dependency>
-            <groupId>org.mongodb</groupId>
-            <artifactId>mongo-java-driver</artifactId>
-            <version>3.4.2</version>
+            <groupId>org.apache.hbase.connectors.spark</groupId>
+            <artifactId>hbase-spark</artifactId>
+            <version>1.0.0</version>
         </dependency>
     </dependencies>
 
@@ -123,21 +123,19 @@
                             <minimizeJar>false</minimizeJar>
                             <shadedArtifactAttached>true</shadedArtifactAttached>
                             <artifactSet>
-<!--                                <excludes>-->
-<!--                                    <exclude>jmock:*</exclude>-->
-<!--                                    <exclude>*:xml-apis</exclude>-->
-<!--                                    <exclude>org.apache.maven:lib:tests</exclude>-->
-<!--                                    <exclude>log4j:log4j:jar:</exclude>-->
-<!--                                </excludes>-->
+                                <!--                                <excludes>-->
+                                <!--                                    <exclude>jmock:*</exclude>-->
+                                <!--                                    <exclude>*:xml-apis</exclude>-->
+                                <!--                                    <exclude>org.apache.maven:lib:tests</exclude>-->
+                                <!--                                    <exclude>log4j:log4j:jar:</exclude>-->
+                                <!--                                </excludes>-->
                                 <includes>
-<!--                                    <include>*:*</include>-->
+                                    <!--                                    <include>*:*</include>-->
                                     <include>cn.hutool:*</include>
-                                    <include>org.mongodb:*</include>
-                                    <include>org.mongodb.spark:*</include>
                                     <include>com.aliyun.odps:*</include>
                                     <include>org.mongodb.*:*</include>
-                                    <include>org.mongodb:mongo-java-driver</include>
-<!--                                    <include>com.aliyun.odps:odps-spark-datasource_2.11:*</include>-->
+                                    <include>org.apache.hbase:*</include>
+                                    <!--                                    <include>com.aliyun.odps:odps-spark-datasource_2.11:*</include>-->
                                 </includes>
                             </artifactSet>
                             <filters>

+ 126 - 0
src/main/scala/com/winhc/bigdata/spark/jobs/CompanyNameMapping.scala

@@ -0,0 +1,126 @@
+package com.winhc.bigdata.spark.jobs
+
+import com.aliyun.odps.utils.StringUtils
+import com.winhc.bigdata.spark.utils.SparkUtils
+import org.apache.hadoop.hbase.client.{Get, Put, Table}
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.spark.HBaseContext
+import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
+import org.apache.hadoop.hbase.util.Bytes
+import org.apache.hadoop.hbase.{CellUtil, TableName}
+import org.apache.spark.internal.Logging
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types.{StringType, StructField, StructType}
+
+import scala.collection.mutable
+import scala.collection.mutable.Set
+
+/**
+ * @Author: XuJiakai
+ * @Date: 2020/5/26 10:41
+ * @Description: Company name mapping relationship calculation
+ */
+object CompanyNameMapping extends Logging {
+  val f_bytes: Array[Byte] = Bytes.toBytes("f")
+  val name_bytes: Array[Byte] = Bytes.toBytes("name")
+  val current_cid_bytes: Array[Byte] = Bytes.toBytes("current_cid")
+
+
+  def getCurrentIdAndName(table: Table, cid: String): (String, String) = {
+    var current_cid: String = cid
+    var name: String = null
+    val mutableSet = Set(cid)
+    while (true) {
+      val result = table.get(new Get(Bytes.toBytes(current_cid)))
+
+      if (result.isEmpty) {
+        return (null, null)
+      }
+      val tmp_name = Bytes.toString(CellUtil.cloneValue(result.getColumnCells(f_bytes, name_bytes) get (0)))
+      var tmp_current_cid: String = null
+      try {
+        tmp_current_cid = Bytes.toString(CellUtil.cloneValue(result.getColumnCells(f_bytes, current_cid_bytes) get (0)))
+      } catch {
+        case e: Exception => {
+        }
+      }
+      if (StringUtils.isEmpty(tmp_current_cid)) {
+        return (current_cid, tmp_name)
+      }
+
+      if (mutableSet.contains(tmp_current_cid)) {
+        return (null, null)
+      }
+      mutableSet += tmp_current_cid
+      current_cid = tmp_current_cid
+      name = tmp_name
+    }
+    (current_cid, name)
+  }
+
+
+  def main(args: Array[String]): Unit = {
+    val map = mutable.Map[String, String](
+      "spark.hadoop.odps.cupid.vpc.domain.list" -> "{\"regionId\":\"cn-shanghai\",\"vpcs\":[{\"vpcId\":\"vpc-11hby9xee\",\"zones\":[{\"urls\":[{\"domain\":\"dds-uf6ff5dfd9aef3641.mongodb.rds.aliyuncs.com\",\"port\":3717},{\"domain\":\"dds-uf6ff5dfd9aef3642.mongodb.rds.aliyuncs.com\",\"port\":3717},{\"domain\":\"hb-uf6as8i6h85k02092-001.hbase.rds.aliyuncs.com\",\"port\":2181}]}]}]}"
+    )
+
+    val hbaseKVTable = "company_name_kv"
+    val resultTable = "company_name_mapping_2"
+    val inputTable = "new_ods_company"
+
+    val spark = SparkUtils.InitEnv("CompanyNameMapping", map)
+    import spark._
+    val df = sql(s"select cid,name,current_cid from $inputTable")
+
+    val jobConf = SparkUtils.HBaseOutputJobConf(hbaseKVTable)
+    df.rdd.map(row => {
+      val id = row(0).asInstanceOf[Long].toString
+      val name = row(1).asInstanceOf[String]
+      val current_cid = row(2).asInstanceOf[Long].toString
+      val put = new Put(Bytes.toBytes(id))
+      put.addColumn(f_bytes, name_bytes, Bytes.toBytes(name))
+      if (!"0".equals(current_cid))
+        put.addColumn(f_bytes, current_cid_bytes, Bytes.toBytes(current_cid))
+      (new ImmutableBytesWritable, put)
+    }).saveAsHadoopDataset(jobConf)
+
+    logInfo("save hbase success!")
+
+    val df_old = sql(s"select cid,name,current_cid from $inputTable where current_cid is not null")
+
+    val hbaseContext = new HBaseContext(spark.sparkContext, jobConf)
+
+    val res_rdd = df_old.rdd.hbaseMapPartitions(hbaseContext, (f, con) => {
+      val table = con.getTable(TableName.valueOf(hbaseKVTable))
+      val rdd_par = f.map(record => {
+        try {
+          val cid = record.getLong(0).toString
+          val name = record.getString(1)
+          val current_cid = record.getLong(2).toString
+          val (res_cid, res_name) = getCurrentIdAndName(table, current_cid)
+          Row(cid, name, current_cid, res_cid, res_name)
+        } catch {
+          case e: Exception => {
+            logWarning(record.toString())
+            logError(e.getMessage, e)
+          }
+            null
+        }
+      })
+      table.close()
+      rdd_par
+    }).filter(_ != null)
+
+    val schema = StructType(Array(
+      StructField("cid", StringType),
+      StructField("name", StringType),
+      StructField("current_cid", StringType),
+      StructField("res_cid", StringType),
+      StructField("res_name", StringType)))
+
+    val res_df = spark.createDataFrame(res_rdd, schema)
+    res_df.write.mode("append").insertInto(resultTable)
+    logInfo("CompanyNameMapping success")
+    spark.stop()
+  }
+}
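
Note on the new job: the core of CompanyNameMapping is the current_cid chain walk in getCurrentIdAndName. Starting from a cid, it follows the current_cid column in the company_name_kv HBase table row by row until it reaches a company with no current_cid, and gives up (returns (null, null)) on a missing row or a cycle. Below is a minimal, self-contained sketch of the same resolution rules, with the HBase table swapped for an in-memory Map purely for illustration; it is not part of the commit.

// Illustration only: getCurrentIdAndName's lookup chain, with the HBase table
// replaced by a Map of cid -> (name, optional current_cid). Same rules:
// stop at a row without current_cid, return (null, null) on a missing row or a cycle.
object ChainResolutionSketch {
  def resolve(kv: Map[String, (String, Option[String])], cid: String): (String, String) = {
    var current = cid
    val seen = scala.collection.mutable.Set(cid)
    while (true) {
      kv.get(current) match {
        case None                  => return (null, null)     // row missing
        case Some((name, None))    => return (current, name)  // terminal company
        case Some((_, Some(next))) =>
          if (seen.contains(next)) return (null, null)        // cycle guard
          seen += next
          current = next
      }
    }
    (null, null) // unreachable; keeps the result type (String, String)
  }

  def main(args: Array[String]): Unit = {
    val kv = Map(
      "1" -> ("Old Name A", Some("2")),
      "2" -> ("Old Name B", Some("3")),
      "3" -> ("Current Name", None))
    println(resolve(kv, "1")) // prints (3,Current Name)
  }
}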

+ 3 - 4
src/main/scala/com/winhc/bigdata/spark/jobs/CompanySummaryBySingle.scala

@@ -3,7 +3,7 @@ package com.winhc.bigdata.spark.jobs
 import com.mongodb.spark.MongoSpark
 import com.winhc.bigdata.spark.utils.CompanySummaryUtils._
 import com.winhc.bigdata.spark.utils.SparkUtils
-import org.apache.commons.logging.LogFactory
+import org.apache.spark.internal.Logging
 
 import scala.collection.mutable
 
@@ -12,12 +12,11 @@ import scala.collection.mutable
  * @Date: 2020/5/21 11:18
  * @Description: Write summary for a single table
  */
-object CompanySummaryBySingle {
-  private val log = LogFactory.getLog(this.getClass)
+object CompanySummaryBySingle extends Logging {
 
   def main(args: Array[String]): Unit = {
     if (args.length < 1) {
-      log.error("请输入表名!")
+      logError("请输入表名!")
       sys.exit(-1)
     }
     val tableName = args(0)

+ 0 - 44
src/main/scala/com/winhc/bigdata/spark/summary/ProbidCalculator.scala

@@ -1,44 +0,0 @@
-package com.winhc.bigdata.spark.summary
-
-import com.mongodb.spark.MongoSpark
-import com.winhc.bigdata.spark.jobs.CompanyInfoCalculator.{LOG, prepare}
-import com.winhc.bigdata.spark.utils.SparkUtils
-import org.apache.commons.logging.LogFactory
-import org.apache.spark.sql.{SaveMode, SparkSession}
-import org.apache.spark.sql.SparkSession
-
-object ProbidCalculator {
-  private val LOG = LogFactory.getLog(this.getClass)
-
-  def main(args: Array[String]): Unit = {
-    val database = "itslaw"
-    val collection = "probid_commonpro"
-//    val host = "dds-uf6ff5dfd9aef3641601-pub.mongodb.rds.aliyuncs.com:3717,dds-uf6ff5dfd9aef3642555-pub.mongodb.rds.aliyuncs.com:3717/itslaw?replicaSet=mgset-6501997"
-    val host = "dds-uf6ff5dfd9aef3641.mongodb.rds.aliyuncs.com:3717,dds-uf6ff5dfd9aef3642.mongodb.rds.aliyuncs.com:3717/itslaw?replicaSet=mgset-6501997"
-    val outPutUri = s"mongodb://itslaw:itslaw_168@$host"
-    val spark: SparkSession = SparkUtils.InitEnvRaw("ProbidCalculator")
-      .config("spark.mongodb.input.uri", outPutUri)
-      .config("spark.mongodb.input.collection", collection)
-      .config("spark.mongodb.output.uri", outPutUri)
-      .config("spark.mongodb.output.database",database)
-      .config("spark.mongodb.output.collection",collection)
-      .config("spark.hadoop.odps.cupid.smartnat.enable",true)
-      .getOrCreate()
-
-    LOG.info("probid calc start!")
-    println("probid calc start!")
-
-    val srcTableName = "ods_probid_commonpro_winbidding_companylist"
-
-    val df = spark.sql(s"SELECT  company_id,company_name,SUM(winbidding_id) FROM ${srcTableName} GROUP BY company_id,company_name LIMIT 100".stripMargin)
-    MongoSpark.save(
-      df
-        .write
-        .mode(SaveMode.Append)
-    )
-
-    LOG.info("probid calc end!")
-    println("probid calc end!")
-    spark.stop();
-  }
-}

+ 28 - 0
src/main/scala/com/winhc/bigdata/spark/test/TestSparkSql.scala

@@ -0,0 +1,28 @@
+package com.winhc.bigdata.spark.test
+
+import com.winhc.bigdata.spark.utils.SparkUtils
+import org.apache.spark.internal.Logging
+
+import scala.collection.mutable
+
+/**
+ * @Author: XuJiakai
+ * @Date: 2020/5/25 08:39
+ * @Description:
+ */
+object TestSparkSql extends Logging {
+  def main(args: Array[String]): Unit = {
+    logInfo("start。。。")
+    val map = mutable.Map[String, String](
+      "spark.hadoop.odps.spark.local.partition.amt" -> "10"
+    )
+    val spark = SparkUtils.InitEnv("test",map)
+    import spark._
+    val df = sql("select * from ods_company limit 100")
+    df.printSchema()
+    df.foreach(println(_))
+    logInfo("end")
+    spark.stop()
+  }
+
+}

+ 15 - 0
src/main/scala/com/winhc/bigdata/spark/utils/SparkUtils.scala

@@ -1,11 +1,25 @@
 package com.winhc.bigdata.spark.utils
 
+import org.apache.hadoop.hbase.{HBaseConfiguration, HConstants}
+import org.apache.hadoop.hbase.mapred.TableOutputFormat
+import org.apache.hadoop.mapred.JobConf
 import org.apache.spark.sql.SparkSession
 
 import scala.collection.mutable
 
 object SparkUtils {
 
+  def HBaseOutputJobConf(outputTable: String): JobConf = {
+    val config = HBaseConfiguration.create()
+    val zkAddress = "hb-uf6as8i6h85k02092-001.hbase.rds.aliyuncs.com"
+    config.set(HConstants.ZOOKEEPER_QUORUM, zkAddress);
+
+    val jobConf = new JobConf(config)
+    jobConf.setOutputFormat(classOf[TableOutputFormat])
+    jobConf.set(TableOutputFormat.OUTPUT_TABLE, outputTable)
+    jobConf
+  }
+
   def InitEnv(appName: String): SparkSession = {
     InitEnv(appName, null)
   }
@@ -36,6 +50,7 @@ object SparkUtils {
     }
     spark.getOrCreate()
   }
+
   def InitEnvRaw(appName: String) = {
     val spark = SparkSession
       .builder()
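
For completeness, here is a minimal usage sketch of the new SparkUtils.HBaseOutputJobConf helper, mirroring the write phase of CompanyNameMapping; the table name "demo_table" and the row written below are placeholders for illustration, not objects that exist in the project.

// Sketch only: writes one Put through SparkUtils.HBaseOutputJobConf, the same
// path CompanyNameMapping uses for company_name_kv. Table and values are made up.
import com.winhc.bigdata.spark.utils.SparkUtils
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes

object HBaseWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkUtils.InitEnv("HBaseWriteSketch")
    val jobConf = SparkUtils.HBaseOutputJobConf("demo_table") // hypothetical table
    spark.sparkContext
      .parallelize(Seq(("1", "demo name")))
      .map { case (rowKey, value) =>
        val put = new Put(Bytes.toBytes(rowKey))
        put.addColumn(Bytes.toBytes("f"), Bytes.toBytes("name"), Bytes.toBytes(value))
        (new ImmutableBytesWritable, put)
      }
      .saveAsHadoopDataset(jobConf)
    spark.stop()
  }
}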