@@ -0,0 +1,213 @@
+package com.winhc.bigdata.spark.ng.relation
+
+import com.alibaba.fastjson.{JSON, JSONArray, JSONPath}
+import com.winhc.bigdata.spark.config.EsConfig
+import com.winhc.bigdata.spark.udf.{BaseFunc, CompanyMapping}
+import com.winhc.bigdata.spark.utils.BaseUtil.is_json_str
+import com.winhc.bigdata.spark.utils.{BaseUtil, LoggingUtils, SparkUtils}
+import org.apache.commons.lang3.StringUtils
+import org.apache.spark.sql.SparkSession
+import scala.collection.mutable
+
+/**
+ * @Description: offline backflow processing of suspected problem data
+ * @author π
+ * @date 2021/4/22 10:59
+ */
+case class job_params(tableName: String
+                      , rowkey: String = "rowkey" // primary-key column
+                      , names: String = "" // SQL expression yielding the array of names to explode
+                     )
+
+object job_params {
+  val tab_args = Seq(
+    job_params(tableName = "company_court_open_announcement"
+      , names = "split_names(CONCAT_WS('&',litigant_info,plaintiff_info,defendant_info),'$.name')"
+    )
+    , job_params(tableName = "company_court_announcement"
+      , names = "split_names(CONCAT_WS('&',plaintiff_info,litigant_info),'$.name')"
+    )
+    , job_params(tableName = "company_send_announcement"
+      , names = "split_names(CONCAT_WS('&',litigant_info,plaintiff_info,defendant_info),'$.name')"
+    )
+    , job_params(tableName = "company_court_register"
+      , names = "split_names(CONCAT_WS('&',litigant_info,plaintiff_info,defendant_info),'$.name')"
+    )
+    , job_params(tableName = "company_zxr_final_case"
+      , names = "split(CONCAT_WS('&',name),'&')"
+    )
+    , job_params(tableName = "company_equity_info"
+      , names = "split_names(CONCAT_WS('&', pledgor_info, pledgee_info),'$.pledgor&$.pledgee')"
+    )
+    , job_params(tableName = "company_zxr"
+      , names = "split(CONCAT_WS('&',name),'&')"
+    )
+    , job_params(tableName = "company_dishonest_info"
+      , names = "split(CONCAT_WS('&',name),'&')"
+    )
+    , job_params(tableName = "company_zxr_restrict"
+      , names = "split(CONCAT_WS('&',company_name),'&')"
+    )
+    , job_params(tableName = "zxr_evaluate_results"
+      , names = "split(CONCAT_WS('&',name),'&')"
+    )
+    , job_params(tableName = "zxr_evaluate"
+      , names = "split(CONCAT_WS('&',name),'&')"
+    )
+    , job_params(tableName = "restrictions_on_exit"
+      , names = "split(CONCAT_WS('&',executed_person_keyno),'&')"
+    )
+    , job_params(tableName = "company_punishment_info"
+      , names = "split(CONCAT_WS('&',company_name),'&')"
+    )
+    , job_params(tableName = "company_punishment_info_creditchina"
+      , names = "split(CONCAT_WS('&',company_name),'&')"
+    )
+    , job_params(tableName = "bankruptcy_open_case"
+      , names = "split_names(CONCAT_WS('&',applicant_info, respondent_info),'$.name')"
+    )
+    , job_params(tableName = "company_judicial_assistance"
+      , names = "split(CONCAT_WS('&',company_name),'&')"
+    )
+    , job_params(tableName = "company_land_mortgage"
+      , names = "split(CONCAT_WS('&',mortgagor,mortgagee),'&')"
+    )
+    , job_params(tableName = "company_public_announcement"
+      , names = "split(CONCAT_WS('&',pay_bank,gather_name,drawer,owner,apply_name),'&')"
+    )
+    , job_params(tableName = "company_abnormal_info"
+      , names = "split(CONCAT_WS('&',company_name),'&')"
+    )
+    , job_params(tableName = "company_illegal_info"
+      , names = "split(CONCAT_WS('&',company_name),'&')"
+    )
+    , job_params(tableName = "auction_tracking"
+      , names = "split_names(CONCAT_WS('&',company_info),'$.company_name')"
+    )
+    , job_params(tableName = "company_brief_cancel_announcement"
+      , names = "split(CONCAT_WS('&',company_name),'&')"
+    )
+    , job_params(tableName = "company_lawsuit"
+      , names = "split_names(CONCAT_WS('&',litigant_info,plaintiff_info,defendant_info),'$.name')"
+    )
+    , job_params(tableName = "company_check_info"
+      , names = "split(CONCAT_WS('&',company_name),'&')"
+    )
+    , job_params(tableName = "company_double_random_check_info"
+      , names = "split(CONCAT_WS('&',company_name),'&')"
+    )
+    , job_params(tableName = "company_mortgage_info"
+      , names = "split(CONCAT_WS('&',company_name),'&')"
+    )
+    , job_params(tableName = "company_own_tax"
+      , names = "split(CONCAT_WS('&',company_name),'&')"
+    )
+    , job_params(tableName = "company_tax_contravention"
+      , names = "split(CONCAT_WS('&',company_name),'&')"
+    )
+  )
+
+  def get_args_company_job(tn: String): job_params = {
+    tab_args.find(p => tn.equals(p.tableName))
+      .getOrElse(throw new IllegalArgumentException(s"table $tn not found"))
+  }
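+
+  // e.g. get_args_company_job("company_zxr").names
+  //   returns "split(CONCAT_WS('&',name),'&')"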
+
+}
+
+case class company_back_update(s: SparkSession
+                               , project: String // project the tables live in
+                               , job_params: job_params // job arguments
+                              ) extends LoggingUtils with BaseFunc with CompanyMapping {
+  override protected val spark: SparkSession = s
+  val tn: String = job_params.tableName
+  val rowkey: String = job_params.rowkey
+  val names: String = job_params.names
+  val ads_tab = s"$project.ads_$tn"
+  val inc_ads_tab = s"$project.inc_ads_$tn"
+
+  // backflow result table
+  val company_back_update = s"$project.tmp_xf_company_back_update"
+  // suspected problem company names
+  //val companyid_name_mapping = s"$project.tmp_xf_companyid_name_mapping_tmp"
+  // problem data from former (historical) company names
+  val companyid_name_mapping = s"$project.tmp_xf_spider_rizhi_name"
+
+  val inter_cols: Seq[String] = getColumns(ads_tab).intersect(getColumns(inc_ads_tab))
+  val lastDs: String = BaseUtil.getYesterday()
+
+  register()
+
+  private def register(): Unit = {
+    prepareFunctions(spark)
+  }
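+
+  // `split_names`, `cleanup` and `name_cleanup` are UDFs registered by
+  // `prepareFunctions` (presumably defined in CompanyMapping / BaseFunc, not
+  // shown here). A minimal sketch of the assumed `split_names` semantics --
+  // the first argument is a '&'-joined list of JSON arrays, and every JSONPath
+  // in the '&'-joined second argument is evaluated against each array to
+  // collect party names:
+  //
+  //   import scala.collection.JavaConverters._
+  //   spark.udf.register("split_names", (infos: String, paths: String) => {
+  //     if (StringUtils.isEmpty(infos)) Seq.empty[String]
+  //     else for {
+  //       json <- infos.split("&").toSeq
+  //       if is_json_str(json)
+  //       path <- paths.split("&").toSeq
+  //       name <- JSONPath.eval(JSON.parse(json), path) match {
+  //         case a: JSONArray => a.asScala.map(v => String.valueOf(v)).toSeq
+  //         case s: String    => Seq(s)
+  //         case _            => Seq.empty[String]
+  //       }
+  //     } yield name
+  //   })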
+
+  def calc(): Unit = {
+
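+    // Pipeline: 1) pull candidate names out of both the full (ads) and
+    // incremental (inc_ads) tables via the per-table `names` expression,
+    // 2) explode the name array, 3) drop names of 5 or fewer cleaned
+    // characters, 4) join against the deduplicated suspect-name list on
+    // normalized names, 5) emit the distinct rowkeys that need reprocessing.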
+    sql(
+      s"""
+         |INSERT OVERWRITE TABLE $company_back_update PARTITION(ds='$lastDs',tn='$tn')
+         |SELECT $rowkey
+         |FROM (
+         |      SELECT $rowkey
+         |      FROM (
+         |            SELECT $rowkey
+         |                   ,name
+         |            FROM (
+         |                  SELECT $rowkey
+         |                         ,name
+         |                  FROM (
+         |                        SELECT $rowkey
+         |                               ,$names names
+         |                        FROM $ads_tab
+         |                        WHERE ds > 0
+         |                        UNION ALL
+         |                        SELECT $rowkey
+         |                               ,$names names
+         |                        FROM $inc_ads_tab
+         |                        WHERE ds > 0
+         |                       )
+         |                  LATERAL VIEW explode(names) b AS name
+         |                 )
+         |            WHERE LENGTH(cleanup(name)) > 5
+         |           ) a
+         |      JOIN (
+         |            SELECT name
+         |            FROM $companyid_name_mapping
+         |            GROUP BY name
+         |           ) b
+         |      ON name_cleanup(a.name) = name_cleanup(b.name)
+         |     )
+         |GROUP BY rowkey
+         |""".stripMargin)
+
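+    // Presumably defensive: the static-partition INSERT OVERWRITE above
+    // already creates the partition, but touching it here guarantees it
+    // exists even if the query wrote zero rows.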
+    sql(
+      s"""
+         |ALTER TABLE $company_back_update ADD IF NOT EXISTS PARTITION(ds='$lastDs',tn='$tn')
+         |""".stripMargin)
+
+  }
+
+}
+
+object company_back_update {
+  def main(args: Array[String]): Unit = {
+    if (args.length != 2) {
+      println(args.mkString(","))
+      println("please set the project and tn args.")
+      sys.exit(-1)
+    }
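+    // Hypothetical invocation (the ODPS project name is only a placeholder):
+    //   spark-submit --class com.winhc.bigdata.spark.ng.relation.company_back_update \
+    //     app.jar <odps_project> company_zxr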
+    val Array(project, tn) = args
+    val config = EsConfig.getEsConfigMap ++ mutable.Map(
+      "spark.hadoop.odps.project.name" -> project,
+      "spark.debug.maxToStringFields" -> "200",
+      "spark.hadoop.odps.spark.local.partition.amt" -> "1000"
+    )
+    val spark = SparkUtils.InitEnv(this.getClass.getSimpleName, config)
+    val re = company_back_update(s = spark, project = project, job_params.get_args_company_job(tn))
+    re.calc()
+    spark.stop()
+  }
+}