@@ -3,6 +3,7 @@ package com.winhc.bigdata.spark.implicits
 import java.time.LocalDate
 import java.time.format.DateTimeFormatter
+import com.winhc.bigdata.spark.utils.DateUtils
 import org.apache.commons.lang3.StringUtils
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.sql.DataFrame
@@ -34,6 +35,10 @@ object CompanyIndexSave2EsHelper {
     , "reg_capital_amount" // registered capital, numeric type
     , "phones" // phone numbers
     , "emails" // email addresses
+
+    , "legal_entity_id" // legal representative id
+    , "legal_entity_type" // legal representative type: 1 = person, 2 = company
+    , "logo" // company logo
   )

   implicit class DataFrameEnhancer(df: DataFrame) {
@@ -45,21 +50,21 @@ object CompanyIndexSave2EsHelper {
         }).toMap
         getEsDoc(map, category_code, area_code)
       })
-        .saveToEsWithMeta("winhc-company-v6/company")
+        .saveToEsWithMeta("winhc-company-v7/company")
     }

-    /* def test2Es(spark:SparkSession,category_code: Broadcast[Map[String, Seq[String]]], area_code: Broadcast[Map[String, Seq[String]]]): Unit = {
-    val rdd = df.select(companyIndexFields.map(column => col(column).cast("string")): _*)
-      .rdd.map(r => {
-      val map = companyIndexFields.map(f => {
-        (f, r.getAs[String](f))
-      }).toMap
-      getEsDoc(map, category_code, area_code)
-    }).map(r=>Row(r._1,r._2.estiblish_time))
-    spark.createDataFrame(rdd,StructType(Array(StructField("cid",StringType),StructField("time",StringType))))
-      .write.mode("overwrite")
-      .insertInto("winhc_eci_dev.xjk_test_save_2_es_0721")
-    }*/
+    /* def test2Es(spark:SparkSession,category_code: Broadcast[Map[String, Seq[String]]], area_code: Broadcast[Map[String, Seq[String]]]): Unit = {
+      val rdd = df.select(companyIndexFields.map(column => col(column).cast("string")): _*)
+        .rdd.map(r => {
+        val map = companyIndexFields.map(f => {
+          (f, r.getAs[String](f))
+        }).toMap
+        getEsDoc(map, category_code, area_code)
+      }).map(r=>Row(r._1,r._2.estiblish_time))
+      spark.createDataFrame(rdd,StructType(Array(StructField("cid",StringType),StructField("time",StringType))))
+        .write.mode("overwrite")
+        .insertInto("winhc_eci_dev.xjk_test_save_2_es_0721")
+    }*/
 }

 case class Geo(lat: String, lon: String)
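For context on the index rename in the hunk above: `saveToEsWithMeta` is the elasticsearch-hadoop writer for pair RDDs, where the first element of each tuple becomes the document `_id`, so the change only redirects writes from `winhc-company-v6/company` to `winhc-company-v7/company`. A minimal standalone sketch of that call pattern, with placeholder data and assuming a reachable `es.nodes` host:

```scala
import org.apache.spark.sql.SparkSession
import org.elasticsearch.spark._ // adds saveToEs / saveToEsWithMeta to RDDs

object SaveWithMetaSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("es-meta-sketch")
      .config("es.nodes", "localhost") // assumption: Elasticsearch reachable here
      .getOrCreate()

    // (docId, document) pairs: the key is written as the Elasticsearch _id,
    // mirroring the (map("cid"), doc) tuples produced by getEsDoc.
    val docs = spark.sparkContext.parallelize(Seq(
      ("1234567", Map("cname" -> "placeholder company", "logo" -> "http://example.com/logo.png"))
    ))
    docs.saveToEsWithMeta("winhc-company-v7/company")
  }
}
```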
@@ -90,6 +95,9 @@ object CompanyIndexSave2EsHelper {
     , reg_capital_amount: String
     , phones: Seq[String]
     , emails: Seq[String]
+    , legal_entity_id: String
+    , legal_entity_type: String
+    , logo: String
   )

   val pattern = "[^\\u4e00-\\u9fa50-9a-zA-Z]".r
@@ -118,17 +126,8 @@ object CompanyIndexSave2EsHelper {
     val category_third = category._4

     val et = map("estiblish_time")
-    var time = if (StringUtils.isNotBlank(et)) {
-      if (et.contains(" ")) {
-        et.split(" ")(0)
-      } else {
-        et
-      }
-    } else null

-    if(!validateDf(time)){
-      time = null
-    }
+    val time: String = DateUtils.toMillisTimestamp(date = et)

     val doc = CompanyDoc(
       cname = getCompanyName(map("name"))
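The `DateUtils.toMillisTimestamp` call above replaces the inline splitting and `validateDf` check that this hunk removes. `DateUtils` is the project's own utility and its body is not part of this diff; a minimal sketch of the behavior the call site appears to rely on (normalize `estiblish_time` to a full timestamp, return null on blank or unparseable input) might look like this, where the helper's exact semantics are an assumption:

```scala
import java.time.{LocalDate, LocalDateTime}
import java.time.format.DateTimeFormatter
import org.apache.commons.lang3.StringUtils

// Hypothetical stand-in for com.winhc.bigdata.spark.utils.DateUtils.toMillisTimestamp;
// the real implementation is not shown in this diff.
object DateUtilsSketch {
  private val fmt = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")

  def toMillisTimestamp(date: String): String = {
    if (StringUtils.isBlank(date)) return null
    try {
      if (date.contains(" "))
        LocalDateTime.parse(date.trim, fmt).format(fmt) // round-trip validates the value
      else
        LocalDate.parse(date.trim).atStartOfDay.format(fmt) // date-only input gets 00:00:00
    } catch {
      case _: java.time.format.DateTimeParseException => null // null on invalid input, like the code it replaces
    }
  }
}
```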
@@ -154,6 +153,9 @@ object CompanyIndexSave2EsHelper {
       , reg_capital_amount = map("reg_capital_amount")
       , phones = getSplit(map("phones"))
       , emails = getSplit(map("emails"))
+      , legal_entity_id = map("legal_entity_id")
+      , legal_entity_type = map("legal_entity_type")
+      , logo = map("logo")
     )
     (map("cid"), doc)
   }