@@ -0,0 +1,163 @@
+package com.winhc.bigdata.spark.jobs
+
+import com.winhc.bigdata.spark.const.BaseConst
+import com.winhc.bigdata.spark.utils.{BaseUtil, EsUtils, HBaseUtils, LoggingUtils, SparkUtils}
+import org.apache.hadoop.hbase.client.Put
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.apache.spark.sql.SparkSession
+
+import scala.annotation.meta.getter
+import scala.collection.mutable
+
+/**
+ * @Author: XuJiakai
+ * @Date: 2020/6/29 10:28
+ * @Description: Write incremental company base-info data to the ads table and to ES
+ */
+object CompanyIncCompany2Es {
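+  // HBase column qualifiers written for each company row (column family taken from BaseConst.F_BYTES)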
+  val outFields = Seq(
+    "CID"
+    , "BASE"
+    , "NAME"
+    , "NAME_EN"
+    , "NAME_ALIAS"
+    , "HISTORY_NAMES"
+    , "LEGAL_ENTITY_ID"
+    , "LEGAL_ENTITY_TYPE"
+    , "REG_NUMBER"
+    , "COMPANY_ORG_TYPE"
+    , "REG_LOCATION"
+    , "ESTIBLISH_TIME"
+    , "FROM_TIME"
+    , "TO_TIME"
+    , "BUSINESS_SCOPE"
+    , "REG_INSTITUTE"
+    , "APPROVED_TIME"
+    , "REG_STATUS"
+    , "REG_CAPITAL"
+    , "ORG_NUMBER"
+    , "ORG_APPROVED_INSTITUTE"
+    , "CURRENT_CID"
+    , "PARENT_CID"
+    , "COMPANY_TYPE"
+    , "CREDIT_CODE"
+    , "SCORE"
+    , "CATEGORY_CODE"
+    , "LAT"
+    , "LNG"
+    , "AREA_CODE"
+    , "REG_CAPITAL_AMOUNT"
+    , "REG_CAPITAL_CURRENCY"
+    , "ACTUAL_CAPITAL_AMOUNT"
+    , "ACTUAL_CAPITAL_CURRENCY"
+    , "REG_STATUS_STD"
+    , "SOCIAL_SECURITY_STAFF_NUM"
+    , "CANCEL_DATE"
+    , "CANCEL_REASON"
+    , "REVOKE_DATE"
+    , "REVOKE_REASON"
+    , "EMAILS"
+    , "PHONES"
+    , "WECHAT_PUBLIC_NUM"
+    , "LOGO"
+    , "CRAWLED_TIME"
+    , "CREATE_TIME"
+    , "UPDATE_TIME"
+    , "DELETED"
+  )
+
+  case class Company2Es(s: SparkSession, project: String, bizDate: String) extends LoggingUtils {
+    @(transient @getter) val spark: SparkSession = s
+
+    def calc(): Unit = {
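+      // flow: compute the (start_partition, end_partition] window, copy inc_ods_company
+      // rows into inc_ads_company, then mirror them to HBase and ES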
+      val partition = bizDate.replaceAll("\\-", "")
+      if (partition.length != 8) {
+        println("bizDate is invalid!")
+        sys.exit(-99)
+      }
+      // incremental window: (last inc_ads partition, last inc_ods partition]
+      val inc_ods_partitions = BaseUtil.getPartitions(s"${project}.inc_ods_company", spark)
+      val end_partition = if (inc_ods_partitions.isEmpty) partition else inc_ods_partitions.last
+
+      val inc_ads_partitions = BaseUtil.getPartitions(s"${project}.inc_ads_company", spark)
+      val start_partition = if (inc_ads_partitions.isEmpty) "0" else inc_ads_partitions.last
+
+      if (start_partition.equals(end_partition)) {
+        println("start_partition == end_partition")
+        sys.exit(-999)
+      }
+
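+      // column list comes from the full ads_company schema, minus the partition column ds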
+      val companyCols = spark.table("ads_company").columns
+        .filter(!_.equals("ds"))
+        .toSeq
+
+      // read the incremental company rows in (start_partition, end_partition]
+      val df = sql(
+        s"""
+           |SELECT ${companyCols.mkString(",")}
+           |FROM $project.inc_ods_company
+           |WHERE ds > $start_partition AND ds <= $end_partition
+           |""".stripMargin)
+
+      df.createOrReplaceTempView("tmp_company_inc")
+
+      // write to ads (on Windows/local runs use INSERT INTO instead of OVERWRITE)
+      sql(
+        s"""
+           |INSERT ${if (BaseUtil.isWindows) "INTO" else "OVERWRITE"} TABLE ${project}.inc_ads_company PARTITION(ds='$end_partition')
+           |SELECT ${companyCols.mkString(",")}
+           |FROM
+           |    tmp_company_inc
+           |""".stripMargin)
+
+      import spark.implicits._
+      // write to HBase: one Put per company, keyed by cid
+      import org.apache.spark.sql.functions.col
+      val jobConf = HBaseUtils.HBaseOutputJobConf("COMPANY")
+      val stringDf = df.select(companyCols.map(column => col(column).cast("string")): _*)
+      stringDf.rdd.map(row => {
+        val id = row.getAs[String]("cid")
+        val put = new Put(Bytes.toBytes(id))
+        for (f <- outFields) {
+          val v = row.getAs[String](f.toLowerCase)
+          if (v != null) {
+            put.addColumn(BaseConst.F_BYTES, Bytes.toBytes(f), Bytes.toBytes(v))
+          }
+        }
+        (new ImmutableBytesWritable, put)
+      }).filter(_ != null).saveAsHadoopDataset(jobConf)
+
+      // write to ES: save (docId, doc) tuples with metadata
+      import com.winhc.bigdata.spark.utils.CompanyEsUtils.getEsDoc
+      import org.elasticsearch.spark._
+      stringDf.map(r => {
+        val cid = r.getAs[String]("cid")
+        val cname = r.getAs[String]("name")
+        val history_names = r.getAs[String]("history_names")
+        val current_cid = r.getAs[String]("current_cid")
+        val company_type = r.getAs[String]("company_type")
+        getEsDoc(cid, cname, history_names, current_cid, company_type)
+      }).rdd.saveToEsWithMeta("winhc-company/company")
+
+    }
+  }
+
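+  // entry point: expects two args, the ODPS project name and the business date
+  // (dashes in bizDate are stripped to form the yyyyMMdd partition value)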
+  def main(args: Array[String]): Unit = {
+    if (args.length != 2) {
+      println("please enter project and bizDate!")
+      sys.exit(-99)
+    }
+
+    val Array(project, bizDate) = args
+
+    val config = EsUtils.getEsConfigMap ++ mutable.Map(
+      "spark.hadoop.odps.project.name" -> project,
+      "spark.hadoop.odps.spark.local.partition.amt" -> "10"
+    )
+
+    val spark = SparkUtils.InitEnv("company2Es", config)
+
+    Company2Es(spark, project, bizDate).calc()
+    spark.stop()
+  }
+}