Просмотр исходного кода

feat: 公司类型、省市区代码、变更类型清洗

JimZhang 1 год назад
Родитель
Сommit
5b3d07bd49

+ 3 - 3
src/main/java/com/winhc/bigdata/udf/CompanyOrgTypeNew.java

@@ -14,7 +14,7 @@ import java.util.*;
  */
 public class CompanyOrgTypeNew extends UDF {
     private static final List<String> FOREIGN = Arrays.asList("中外", "外资", "外国", "外商");
-    private static final List<String> CHINA = Arrays.asList("国有", "国资");
+    private static final List<String> CHINA = Arrays.asList("国有", "国资", "全民");
 
     public List<String> evaluate(String company_org_type, String company_name, String credit_code) {
         if (isEmpty(company_name)) return null;
@@ -25,11 +25,11 @@ public class CompanyOrgTypeNew extends UDF {
 
 //工商注册类型判断
         List<String> result = new ArrayList<>();
-        if (contains(company_org_type, "股份") && !contains(company_org_type, "合作") && contains(company_name, "公司")&&contains(company_name,"股份")) {
+        if (contains(company_org_type, "股份") && !contains(company_org_type, "合作") && contains(company_name, "公司") && contains(company_name, "股份")) {
             result.add("股份有限公司");
         } else if (contains(company_org_type, "有限责任") || contains(company_name, "有限责任公司") || contains(company_name, "有限公司")) {
             result.add("有限责任公司");
-        } else if (contains(company_org_type, "股份合作")||contains(company_org_type,"股份制")) {
+        } else if (contains(company_org_type, "股份合作") || contains(company_org_type, "股份制")) {
             result.add("股份合作企业");
         } else if (contains(company_org_type, "个体") || contains(company_org_type, "个人经营")) {
             result.add("个体工商户");

Разница между файлами не показана из-за своего большого размера
+ 1691 - 2549
src/main/java/com/winhc/bigdata/udf/ComputeAdCode.java


+ 4 - 1
src/main/java/com/winhc/bigdata/udf/StringCleanupChangeInfo.java

@@ -13,15 +13,18 @@ public class StringCleanupChangeInfo extends UDF {
     private static final Pattern first_p = Pattern.compile("[^\\u4e00-\\u9fa50-9a-zA-Z()()]");
     private static final Pattern second_p = Pattern.compile("[((][^((]+[))]$");
     private static final Pattern third_p = Pattern.compile("[^\\u4e00-\\u9fa50-9a-zA-Z]");
+    private static final Pattern html_p = Pattern.compile("<[^>]+>");
 
     public String evaluate(String val) {
         if (StringUtils.isEmpty(val)) return "";
+        val = html_p.matcher(val).replaceAll("");
         val = first_p.matcher(val).replaceAll("");
         val = second_p.matcher(val).replaceAll("");
         return third_p.matcher(val).replaceAll("");
     }
 
     public static void main(String[] args) {
-        System.out.println(new StringCleanupChangeInfo().evaluate("f (xx()))===="));
+
+        System.out.println(new StringCleanupChangeInfo().evaluate("<b>企业基本信息:</b><b>投资人及出资信息:</b>出资方式:货币,出资方式:货币,出资比例:90.0,出资比例:10.0,认缴出资额:180.0,认缴出资额:20.0<br><b>公司基本信息(补充):</b>出资说明:所认缴的注册资本分期于公司成立之日起20年内缴足。"));
     }
 }

+ 27 - 0
src/main/java/com/winhc/bigdata/udf/StringCleanupChangeInfoContent.java

@@ -0,0 +1,27 @@
+package com.winhc.bigdata.udf;
+
+import com.aliyun.odps.udf.UDF;
+import com.aliyun.odps.utils.StringUtils;
+
+import java.util.regex.Pattern;
+
+/**
+ * @author ZhangJi
+ * @since 2022-03-30 13:35
+ */
+public class StringCleanupChangeInfoContent extends UDF {
+    /** Matches tag-shaped substrings: '<', one or more characters other than '>', then '>'. */
+    private static final Pattern html_p = Pattern.compile("<[^>]+>");
+
+    /**
+     * Removes every tag-shaped substring from val; returns "" when StringUtils.isEmpty(val) holds.
+     */
+    public String evaluate(String val) {
+        if (StringUtils.isEmpty(val)) return "";
+        return html_p.matcher(val).replaceAll("");
+    }
+
+    public static void main(String[] args) {
+        System.out.println(new StringCleanupChangeInfoContent().evaluate("<b>企业基本信息:</b><b>投资人及出资信息:</b>出资方式:货币,出资方式:货币,出资比例:90.0,出资比例:10.0,认缴出资额:180.0,认缴出资额:20.0<br><b>公司基本信息(补充):</b>出资说明:所认缴的注册资本分期于公司成立之日起20年内缴足。"));
+    }
+}

+ 24 - 0
src/main/java/com/winhc/bigdata/udf/StringContainHtmlTag.java

@@ -0,0 +1,24 @@
+package com.winhc.bigdata.udf;
+
+import com.aliyun.odps.udf.UDF;
+import com.aliyun.odps.utils.StringUtils;
+
+import java.util.regex.Pattern;
+
+/**
+ * @author ZhangJi
+ * @since 2022-09-28 08:22
+ */
+public class StringContainHtmlTag extends UDF {
+    /** Matches tag-shaped substrings: '<', one or more characters other than '>', then '>'. */
+    private static final Pattern html_p = Pattern.compile("<[^>]+>");
+
+    /** Reports whether val has a tag-shaped substring; false when StringUtils.isEmpty(val) holds. */
+    public Boolean evaluate(String val) {
+        return !StringUtils.isEmpty(val) && html_p.matcher(val).find();
+    }
+
+    public static void main(String[] args) {
+        System.out.println(new StringContainHtmlTag().evaluate("<b>工商登记联络人:</b><br>备案日期:20220121,null:1,null:,邮政编码:361000,null:,性别:男性,null:,null:,null:,null:,null:,姓名:李议麟<br><b>企业基本信息:</b>"));
+    }
+}