许家凯 4 years ago
Parent
Current commit
aa86eb7b01

+ 24 - 0
src/main/scala/com/winhc/bigdata/spark/jobs/CompanySummaryInc.scala

@@ -0,0 +1,24 @@
+package com.winhc.bigdata.spark.jobs
+
+import com.winhc.bigdata.spark.utils.{CompanyIncSummary, SparkUtils}
+
+import scala.collection.mutable
+
+/**
+ * @Author: XuJiakai
+ * @Date: 2020/6/24 16:07
+ * @Description: entry point for the incremental company summary job; recomputes per-company row counts for one dimension table
+ */
+object CompanySummaryInc {
+  // example args: winhc_eci_dev company_icp new_cid liscense,domain,new_cid
+  def main(args: Array[String]): Unit = {
+    val Array(project, tableName, cidField, dupliCols) = args
+
+    val config = mutable.Map(
+      "spark.hadoop.odps.project.name" -> project
+    )
+    val spark = SparkUtils.InitEnv("CompanySummaryInc", config)
+    CompanyIncSummary(spark, project, tableName, cidField, dupliCols.split(",").seq).calc
+    spark.stop()
+  }
+}
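
For reference, main binds exactly four positional arguments through a pattern match, so a wrong argument count fails with a bare MatchError. Below is a small sketch of the same contract with an explicit arity check; the helper name is hypothetical and not part of this commit:

    object CompanySummaryIncArgs {
      // same positional contract as CompanySummaryInc.main, with a readable failure mode
      def parse(args: Array[String]): (String, String, String, Seq[String]) = {
        require(args.length == 4,
          s"usage: <project> <tableName> <cidField> <dupliCols,comma,separated>; got ${args.length} args")
        val Array(project, tableName, cidField, dupliCols) = args
        (project, tableName, cidField, dupliCols.split(",").toSeq)
      }
    }

    // e.g. parse(Array("winhc_eci_dev", "company_icp", "new_cid", "liscense,domain,new_cid"))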

+ 104 - 0
src/main/scala/com/winhc/bigdata/spark/utils/CompanyIncSummary.scala

@@ -0,0 +1,104 @@
+package com.winhc.bigdata.spark.utils
+
+import org.apache.hadoop.hbase.client.Put
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable
+import org.apache.hadoop.hbase.util.Bytes
+import org.apache.spark.sql.SparkSession
+
+import scala.annotation.meta.getter
+
+
+/**
+ * @Author: XuJiakai
+ * @Date: 2020/6/22 16:33
+ * @Description: merges the latest full ads partition with newer inc_ads rows, dedups by dupliCols, and counts rows per company
+ */
+case class CompanyIncSummary(s: SparkSession,
+                             project: String, // project that owns the tables
+                             tableName: String, // main table name (without prefix)
+                             cidField: String, // company-id field name
+                             dupliCols: Seq[String] // dedup key columns
+                            ) extends LoggingUtils {
+  @(transient@getter) val spark: SparkSession = s
+  private val f_bytes: Array[Byte] = Bytes.toBytes("F")
+  private val name_bytes: Array[Byte] = Bytes.toBytes(tableName.toUpperCase)
+
+  def calc(): Unit = {
+    val ads_table = s"${project}.ads_$tableName" // full (base) ads table
+    val inc_ads_table = s"${project}.inc_ads_$tableName" // incremental ads table
+
+
+    val partition = sql(s"show partitions $ads_table").collect.toList
+      .map(_.getString(0).split("=")(1)) // e.g. "ds=20200624" -> "20200624"
+      .last // assumes partitions list in ascending order, so last is the latest ds
+
+
+    val jobConf = HBaseUtils.HBaseOutputJobConf("COMPANY_SUMMARY") // for the HBase write below (currently disabled)
+
+    val ads_table_cols = spark.table(ads_table).columns.filter(l => {
+      !l.equals("ds") && !l.equals("rowkey") && !l.equals("flag")
+    }).toList.sorted
+
+    val inc_ads_table_cols = spark.table(inc_ads_table).columns.filter(l => {
+      !l.equals("ds") && !l.equals("rowkey") && !l.equals("flag")
+    }).toList.sorted
+
+    val new_cols = (ads_table_cols ::: inc_ads_table_cols).distinct.sorted
+    if (new_cols.size != inc_ads_table_cols.size || new_cols.size != ads_table_cols.size) {
+      println(ads_table_cols)
+      println(inc_ads_table_cols)
+      println("cols not equals!")
+      sys.exit(-99)
+    }
+    sql(
+      s"""
+         |CREATE TABLE IF NOT EXISTS ${project}.xjk_tmp_count_$tableName as
+         |SELECT  ${cidField} as cid
+         |        ,COUNT(1) as num
+         |FROM    (
+         |            SELECT  tmp.*
+         |                    ,ROW_NUMBER() OVER(PARTITION BY ${dupliCols.mkString(",")} ORDER BY update_time DESC ) c
+         |            FROM    (
+         |                        SELECT  ${new_cols.map(getCastCols(_, "org_tab.")).mkString(",")}
+         |                        FROM    (
+         |                                    SELECT  DISTINCT $cidField as $cidField
+         |                                    FROM    $inc_ads_table
+         |                                    WHERE   ds > $partition
+         |                                ) id_table
+         |                        JOIN (
+         |                                      SELECT  *
+         |                                      FROM    $ads_table
+         |                                      WHERE   ds = '$partition'
+         |                                  ) org_tab
+         |                        ON      id_table.$cidField = org_tab.$cidField
+         |                        UNION ALL
+         |                        SELECT  ${new_cols.map(getCastCols(_, "")).mkString(",")}
+         |                        FROM    $inc_ads_table
+         |                        WHERE   ds > $partition
+         |                    ) AS tmp
+         |        ) tmp2
+         |WHERE   tmp2.c = 1
+         |GROUP BY $cidField
+         |""".stripMargin)
+//      .write.mode("overwrite").saveAsTable(s"${project}.xjk_tmp_count_$tableName")
+    /* .rdd.flatMap(row => {
+         val id = row.get(0).toString // cid was cast to BIGINT upstream, so stringify rather than cast
+         val num = row.getLong(1) // COUNT(1) yields a BIGINT, not a String
+         if (num != 0L) {
+           val put = new Put(Bytes.toBytes(id))
+           put.addColumn(f_bytes, name_bytes, Bytes.toBytes(num.toString))
+           Some((new ImmutableBytesWritable, put))
+         } else {
+           None // a bare `return null` inside the closure would throw NonLocalReturnControl
+         }
+       }).saveAsHadoopDataset(jobConf) */
+  }
+
+  def getCastCols(name: String, pre: String): String = {
+    val list = List("cid", "new_cid", "ncid") // company-id columns normalized to BIGINT
+    if (list.contains(name)) {
+      return s"CAST(${pre}${name} as BIGINT) $name"
+    }
+    pre + name
+  }
+}
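
calc() builds a single SQL statement: take companies touched by partitions newer than the last full ads partition, union their base rows with the new incremental rows, keep one row per dupliCols key (latest update_time wins), and count the survivors per company. Below is a minimal local sketch of that dedup-then-count shape, with hypothetical column names and in-memory data standing in for the ODPS tables:

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.expressions.Window
    import org.apache.spark.sql.functions._

    object DedupCountSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[*]").appName("DedupCountSketch").getOrCreate()
        import spark.implicits._

        // two rows share the dedup key ("a") for company 1; the later update wins
        val rows = Seq(
          (1L, "a", "2020-06-01"),
          (1L, "a", "2020-06-20"),
          (1L, "b", "2020-06-10"),
          (2L, "c", "2020-06-05")
        ).toDF("new_cid", "license_no", "update_time")

        // same shape as calc()'s SQL: ROW_NUMBER over the dedup key, keep c = 1, count per company
        val w = Window.partitionBy("new_cid", "license_no").orderBy(col("update_time").desc)
        val summary = rows
          .withColumn("c", row_number().over(w))
          .where(col("c") === 1)
          .groupBy("new_cid")
          .agg(count(lit(1)).as("num"))

        summary.show() // new_cid 1 -> num 2, new_cid 2 -> num 1
        spark.stop()
      }
    }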

+ 28 - 0
src/main/scala/com/winhc/bigdata/spark/utils/CompanySummaryUtils.scala

@@ -6,6 +6,34 @@ package com.winhc.bigdata.spark.utils
  * @Description:
  */
 object CompanySummaryUtils {
+
+  val map = Map(
+    "company_license" -> ("new_cid", "company_license"), // administrative licenses (industry & commerce bureau)
+    "company_abnormal_info" -> ("new_cid", "company_abnormal_info"), // abnormal operation records
+    "company_liquidating_info" -> ("new_cid", "company_liquidating_info"), // liquidation info
+    "company_equity_info_list" -> ("cid", "company_equity_info_list"), // equity pledges
+    "company_judicial_assistance" -> ("new_cid", "company_judicial_assistance"), // judicial assistance
+    "company_illegal_info" -> ("new_cid", "company_illegal_info"), // serious violations
+    "company_license_entpub" -> ("new_cid", "company_license_entpub"), // administrative licenses (enterprise disclosure)
+    "company_patent_list" -> ("new_cid", "company_patent_list"), // patents
+    "company_copyright_works_list" -> ("new_cid", "company_copyright_works_list"), // work copyrights
+    "company_copyright_reg_list" -> ("new_cid", "company_copyright_reg_list"), // software copyrights
+    "company_wechat" -> ("new_cid", "company_wechat"), // WeChat official accounts
+    "company_tm" -> ("new_cid", "company_tm"), // trademarks
+    "company_icp" -> ("new_cid", "company_icp"), // website ICP filings
+    "company_app_info" -> ("new_cid", "company_app_info"), // app/product info
+    "company_license_creditchina" -> ("new_cid", "company_license_creditchina"), // administrative licenses (Credit China)
+    "company_court_announcement_list" -> ("cid", "company_court_announcement_list"), // court announcements
+    "company_env_punishment" -> ("cid", "company_env_punishment"), // environmental penalties
+    "company_punishment_info" -> ("cid", "company_punishment_info"), // administrative penalties
+    "company_punishment_info_creditchina" -> ("cid", "company_punishment_info_creditchina"), // administrative penalties (Credit China)
+    "company_land_transfer" -> ("cid", "company_land_transfer"), // land transfers
+    "company_land_publicity" -> ("cid", "company_land_publicity"), // land parcel publicity
+    "company_land_mortgage" -> ("cid", "company_land_mortgage"), // land mortgages
+    "company_land_announcement" -> ("cid", "company_land_announcement"), // land purchase announcements
+    "company_bid_list" -> ("cid", "company_bid_list") // bids and tenders
+  )
+
   def getSummarySql(tableName: String, companyIdFieldName: String) = s"select $companyIdFieldName as company_id,count(1) as ${tableName}_num from $tableName where $companyIdFieldName <>0 group by $companyIdFieldName"
 
   def main(args: Array[String]): Unit = {
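
getSummarySql renders one count-per-company query per dimension table, and the new map pairs each table with its company-id column. A hypothetical wiring of the two is sketched below; the main body is cut off in this hunk, so this is an assumption, not the commit's code:

    // hypothetical driver; assumes main walks the map, which this hunk does not show
    CompanySummaryUtils.map.foreach { case (table, (cidField, _)) =>
      println(CompanySummaryUtils.getSummarySql(table, cidField))
    }
    // for "company_icp" -> ("new_cid", _) this prints:
    // select new_cid as company_id,count(1) as company_icp_num from company_icp where new_cid <>0 group by new_cid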